diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index b781c4a8e..d52b0dca7 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -14,3 +14,9 @@ import _locale _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) + + + + + + diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py index a91958105..83be8e32e 100644 --- a/paddlespeech/audio/__init__.py +++ b/paddlespeech/audio/__init__.py @@ -14,12 +14,12 @@ from . import compliance from . import datasets from . import features +from . import text +from . import transform +from . import streamdata from . import functional from . import io from . import metric from . import sox_effects -from . import streamdata -from . import text -from . import transform from .backends import load from .backends import save diff --git a/paddlespeech/audio/streamdata/__init__.py b/paddlespeech/audio/streamdata/__init__.py index 47a2e79b3..753fcc11b 100644 --- a/paddlespeech/audio/streamdata/__init__.py +++ b/paddlespeech/audio/streamdata/__init__.py @@ -4,66 +4,67 @@ # Modified from https://github.com/webdataset/webdataset # # flake8: noqa -from .cache import cached_tarfile_samples -from .cache import cached_tarfile_to_samples -from .cache import lru_cleanup -from .cache import pipe_cleaner -from .compat import FluidWrapper -from .compat import WebDataset -from .compat import WebLoader -from .extradatasets import MockDataset -from .extradatasets import with_epoch -from .extradatasets import with_length -from .filters import associate -from .filters import audio_cmvn -from .filters import audio_compute_fbank -from .filters import audio_data_filter -from .filters import audio_padding -from .filters import audio_resample -from .filters import audio_spec_aug -from .filters import audio_tokenize -from .filters import batched -from .filters import decode -from .filters import detshuffle -from .filters import extract_keys -from .filters import getfirst -from .filters import info -from .filters import map -from .filters import map_dict -from .filters import map_tuple -from .filters import pipelinefilter -from .filters import placeholder -from .filters import rename -from .filters import rename_keys -from .filters import select -from .filters import shuffle -from .filters import slice -from .filters import sort -from .filters import to_tuple -from .filters import transform_with -from .filters import unbatched -from .filters import xdecode -from .handlers import ignore_and_continue -from .handlers import ignore_and_stop -from .handlers import reraise_exception -from .handlers import warn_and_continue -from .handlers import warn_and_stop -from .mix import RandomMix -from .mix import RoundRobin + +from .cache import ( + cached_tarfile_samples, + cached_tarfile_to_samples, + lru_cleanup, + pipe_cleaner, +) +from .compat import WebDataset, WebLoader, FluidWrapper +from .extradatasets import MockDataset, with_epoch, with_length +from .filters import ( + associate, + batched, + decode, + detshuffle, + extract_keys, + getfirst, + info, + map, + map_dict, + map_tuple, + pipelinefilter, + rename, + rename_keys, + audio_resample, + select, + shuffle, + slice, + to_tuple, + transform_with, + unbatched, + xdecode, + audio_data_filter, + audio_tokenize, + audio_resample, + audio_compute_fbank, + audio_spec_aug, + sort, + audio_padding, + audio_cmvn, + placeholder, +) +from .handlers import ( + ignore_and_continue, + ignore_and_stop, + reraise_exception, + warn_and_continue, + warn_and_stop, +) from .pipeline import 
DataPipeline -from .shardlists import MultiShardSample -from .shardlists import non_empty -from .shardlists import resampled -from .shardlists import ResampledShards -from .shardlists import shardspec -from .shardlists import SimpleShardList -from .shardlists import single_node_only -from .shardlists import split_by_node -from .shardlists import split_by_worker -from .tariterators import tarfile_samples -from .tariterators import tarfile_to_samples -from .utils import PipelineStage -from .utils import repeatedly -from .writer import numpy_dumps -from .writer import ShardWriter -from .writer import TarWriter +from .shardlists import ( + MultiShardSample, + ResampledShards, + SimpleShardList, + non_empty, + resampled, + shardspec, + single_node_only, + split_by_node, + split_by_worker, +) +from .tariterators import tarfile_samples, tarfile_to_samples +from .utils import PipelineStage, repeatedly +from .writer import ShardWriter, TarWriter, numpy_dumps +from .mix import RandomMix, RoundRobin diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py index d7f7937bd..ca0e2ea2f 100644 --- a/paddlespeech/audio/streamdata/autodecode.py +++ b/paddlespeech/audio/streamdata/autodecode.py @@ -5,19 +5,18 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Automatically decode webdataset samples.""" -import io -import json -import os -import pickle -import re -import tempfile + +import io, json, os, pickle, re, tempfile from functools import partial import numpy as np + """Extensions passed on to the image decoder.""" image_extensions = "jpg jpeg png ppm pgm pbm pnm".split() + ################################################################ # handle basic datatypes ################################################################ @@ -129,7 +128,7 @@ def call_extension_handler(key, data, f, extensions): target = target.split(".") if len(target) > len(extension): continue - if extension[-len(target):] == target: + if extension[-len(target) :] == target: return f(data) return None @@ -269,6 +268,7 @@ def imagehandler(imagespec, extensions=image_extensions): ################################################################ # torch video ################################################################ + ''' def torch_video(key, data): """Decode video using the torchvideo library. 
@@ -289,6 +289,7 @@ def torch_video(key, data): return torchvision.io.read_video(fname, pts_unit="sec") ''' + ################################################################ # paddlespeech.audio ################################################################ @@ -358,6 +359,7 @@ def gzfilter(key, data): # decode entire training amples ################################################################ + default_pre_handlers = [gzfilter] default_post_handlers = [basichandlers] @@ -385,8 +387,7 @@ class Decoder: pre = default_pre_handlers if post is None: post = default_post_handlers - assert all(callable(h) - for h in handlers), f"one of {handlers} not callable" + assert all(callable(h) for h in handlers), f"one of {handlers} not callable" assert all(callable(h) for h in pre), f"one of {pre} not callable" assert all(callable(h) for h in post), f"one of {post} not callable" self.handlers = pre + handlers + post diff --git a/paddlespeech/audio/streamdata/cache.py b/paddlespeech/audio/streamdata/cache.py index faa196398..e7bbffa1b 100644 --- a/paddlespeech/audio/streamdata/cache.py +++ b/paddlespeech/audio/streamdata/cache.py @@ -2,10 +2,7 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset -import os -import random -import re -import sys +import itertools, os, random, re, sys from urllib.parse import urlparse from . import filters @@ -43,7 +40,7 @@ def lru_cleanup(cache_dir, cache_size, keyfn=os.path.getctime, verbose=False): os.remove(fname) -def download(url, dest, chunk_size=1024**2, verbose=False): +def download(url, dest, chunk_size=1024 ** 2, verbose=False): """Download a file from `url` to `dest`.""" temp = dest + f".temp{os.getpid()}" with gopen.gopen(url) as stream: @@ -68,11 +65,12 @@ def pipe_cleaner(spec): def get_file_cached( - spec, - cache_size=-1, - cache_dir=None, - url_to_name=pipe_cleaner, - verbose=False, ): + spec, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + verbose=False, +): if cache_size == -1: cache_size = default_cache_size if cache_dir is None: @@ -109,14 +107,15 @@ verbose_cache = int(os.environ.get("WDS_VERBOSE_CACHE", "0")) def cached_url_opener( - data, - handler=reraise_exception, - cache_size=-1, - cache_dir=None, - url_to_name=pipe_cleaner, - validator=check_tar_format, - verbose=False, - always=False, ): + data, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + url_to_name=pipe_cleaner, + validator=check_tar_format, + verbose=False, + always=False, +): """Given a stream of url names (packaged in `dict(url=url)`), yield opened streams.""" verbose = verbose or verbose_cache for sample in data: @@ -133,7 +132,8 @@ def cached_url_opener( cache_size=cache_size, cache_dir=cache_dir, url_to_name=url_to_name, - verbose=verbose, ) + verbose=verbose, + ) if verbose: print("# opening %s" % dest, file=sys.stderr) assert os.path.exists(dest) @@ -143,8 +143,9 @@ def cached_url_opener( data = f.read(200) os.remove(dest) raise ValueError( - "%s (%s) is not a tar archive, but a %s, contains %s" % - (dest, url, ftype, repr(data))) + "%s (%s) is not a tar archive, but a %s, contains %s" + % (dest, url, ftype, repr(data)) + ) try: stream = open(dest, "rb") sample.update(stream=stream) @@ -157,7 +158,7 @@ def cached_url_opener( continue raise exn except Exception as exn: - exn.args = exn.args + (url, ) + exn.args = exn.args + (url,) if handler(exn): continue else: @@ -165,13 +166,14 @@ def cached_url_opener( def 
cached_tarfile_samples( - src, - handler=reraise_exception, - cache_size=-1, - cache_dir=None, - verbose=False, - url_to_name=pipe_cleaner, - always=False, ): + src, + handler=reraise_exception, + cache_size=-1, + cache_dir=None, + verbose=False, + url_to_name=pipe_cleaner, + always=False, +): streams = cached_url_opener( src, handler=handler, @@ -179,7 +181,8 @@ def cached_tarfile_samples( cache_dir=cache_dir, verbose=verbose, url_to_name=url_to_name, - always=always, ) + always=always, + ) samples = tar_file_and_group_expander(streams, handler=handler) return samples diff --git a/paddlespeech/audio/streamdata/compat.py b/paddlespeech/audio/streamdata/compat.py index 9012eeb10..deda53384 100644 --- a/paddlespeech/audio/streamdata/compat.py +++ b/paddlespeech/audio/streamdata/compat.py @@ -2,17 +2,17 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset -import yaml +from dataclasses import dataclass +from itertools import islice +from typing import List + +import braceexpand, yaml from . import autodecode -from . import cache -from . import filters -from . import shardlists -from . import tariterators +from . import cache, filters, shardlists, tariterators from .filters import reraise_exception -from .paddle_utils import DataLoader -from .paddle_utils import IterableDataset from .pipeline import DataPipeline +from .paddle_utils import DataLoader, IterableDataset class FluidInterface: @@ -26,8 +26,7 @@ class FluidInterface: return self.compose(filters.unbatched()) def listed(self, batchsize, partial=True): - return self.compose( - filters.batched(), batchsize=batchsize, collation_fn=None) + return self.compose(filters.batched(), batchsize=batchsize, collation_fn=None) def unlisted(self): return self.compose(filters.unlisted()) @@ -44,19 +43,9 @@ class FluidInterface: def map(self, f, handler=reraise_exception): return self.compose(filters.map(f, handler=handler)) - def decode(self, - *args, - pre=None, - post=None, - only=None, - partial=False, - handler=reraise_exception): - handlers = [ - autodecode.ImageHandler(x) if isinstance(x, str) else x - for x in args - ] - decoder = autodecode.Decoder( - handlers, pre=pre, post=post, only=only, partial=partial) + def decode(self, *args, pre=None, post=None, only=None, partial=False, handler=reraise_exception): + handlers = [autodecode.ImageHandler(x) if isinstance(x, str) else x for x in args] + decoder = autodecode.Decoder(handlers, pre=pre, post=post, only=only, partial=partial) return self.map(decoder, handler=handler) def map_dict(self, handler=reraise_exception, **kw): @@ -91,12 +80,12 @@ class FluidInterface: def audio_data_filter(self, *args, **kw): return self.compose(filters.audio_data_filter(*args, **kw)) - + def audio_tokenize(self, *args, **kw): return self.compose(filters.audio_tokenize(*args, **kw)) def resample(self, *args, **kw): - return self.compose(filters.resample(*args, **kw)) + return self.compose(filters.resample(*args, **kw)) def audio_compute_fbank(self, *args, **kw): return self.compose(filters.audio_compute_fbank(*args, **kw)) @@ -113,28 +102,27 @@ class FluidInterface: def audio_cmvn(self, cmvn_file): return self.compose(filters.audio_cmvn(cmvn_file)) - class WebDataset(DataPipeline, FluidInterface): """Small fluid-interface wrapper for DataPipeline.""" def __init__( - self, - urls, - handler=reraise_exception, - resampled=False, - repeat=False, - shardshuffle=None, - cache_size=0, - 
cache_dir=None, - detshuffle=False, - nodesplitter=shardlists.single_node_only, - verbose=False, ): + self, + urls, + handler=reraise_exception, + resampled=False, + repeat=False, + shardshuffle=None, + cache_size=0, + cache_dir=None, + detshuffle=False, + nodesplitter=shardlists.single_node_only, + verbose=False, + ): super().__init__() if isinstance(urls, IterableDataset): assert not resampled self.append(urls) - elif isinstance(urls, str) and (urls.endswith(".yaml") or - urls.endswith(".yml")): + elif isinstance(urls, str) and (urls.endswith(".yaml") or urls.endswith(".yml")): with (open(urls)) as stream: spec = yaml.safe_load(stream) assert "datasets" in spec @@ -164,7 +152,9 @@ class WebDataset(DataPipeline, FluidInterface): handler=handler, verbose=verbose, cache_size=cache_size, - cache_dir=cache_dir, )) + cache_dir=cache_dir, + ) + ) class FluidWrapper(DataPipeline, FluidInterface): diff --git a/paddlespeech/audio/streamdata/extradatasets.py b/paddlespeech/audio/streamdata/extradatasets.py index 76361c24a..e6d617724 100644 --- a/paddlespeech/audio/streamdata/extradatasets.py +++ b/paddlespeech/audio/streamdata/extradatasets.py @@ -5,10 +5,20 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + + """Train PyTorch models directly from POSIX tar archive. Code works locally or over HTTP connections. """ + +import itertools as itt +import os +import random +import sys + +import braceexpand + from . import utils from .paddle_utils import IterableDataset from .utils import PipelineStage @@ -53,7 +63,8 @@ class repeatedly(IterableDataset, PipelineStage): return utils.repeatedly( source, nepochs=self.nepochs, - nbatches=self.nbatches, ) + nbatches=self.nbatches, + ) class with_epoch(IterableDataset): diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py index 68d6830bb..82b9c6bab 100644 --- a/paddlespeech/audio/streamdata/filters.py +++ b/paddlespeech/audio/streamdata/filters.py @@ -3,6 +3,7 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + # Modified from https://github.com/webdataset/webdataset # Modified from wenet(https://github.com/wenet-e2e/wenet) """A collection of iterators for data transformations. @@ -11,29 +12,28 @@ These functions are plain iterator functions. You can find curried versions in webdataset.filters, and you can find IterableDataset wrappers in webdataset.processing. """ + import io -import itertools -import os -import random -import re -import sys -import time from fnmatch import fnmatch -from functools import reduce +import re +import itertools, os, random, sys, time +from functools import reduce, wraps -import paddle +import numpy as np from . import autodecode -from . import utils +from . import utils +from .paddle_utils import PaddleTensor +from .utils import PipelineStage + from .. import backends from ..compliance import kaldi +import paddle from ..transform.cmvn import GlobalCMVN -from ..transform.spec_augment import freq_mask -from ..transform.spec_augment import time_mask -from ..transform.spec_augment import time_warp from ..utils.tensor_utils import pad_sequence -from .utils import PipelineStage - +from ..transform.spec_augment import time_warp +from ..transform.spec_augment import time_mask +from ..transform.spec_augment import freq_mask class FilterFunction(object): """Helper class for currying pipeline stages. 
@@ -159,12 +159,10 @@ def transform_with(sample, transformers): result[i] = f(sample[i]) return result - ### # Iterators ### - def _info(data, fmt=None, n=3, every=-1, width=50, stream=sys.stderr, name=""): """Print information about the samples that are passing through. @@ -280,16 +278,10 @@ def _log_keys(data, logfile=None): log_keys = pipelinefilter(_log_keys) -def _minedecode(x): - if isinstance(x, str): - return autodecode.imagehandler(x) - else: - return x - - def _decode(data, *args, handler=reraise_exception, **kw): """Decode data based on the decoding functions given as arguments.""" - decoder = _minedecode + + decoder = lambda x: autodecode.imagehandler(x) if isinstance(x, str) else x handlers = [decoder(x) for x in args] f = autodecode.Decoder(handlers, **kw) @@ -333,24 +325,15 @@ def _rename(data, handler=reraise_exception, keep=True, **kw): for sample in data: try: if not keep: - yield { - k: getfirst(sample, v, missing_is_error=True) - for k, v in kw.items() - } + yield {k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()} else: def listify(v): return v.split(";") if isinstance(v, str) else v to_be_replaced = {x for v in kw.values() for x in listify(v)} - result = { - k: v - for k, v in sample.items() if k not in to_be_replaced - } - result.update({ - k: getfirst(sample, v, missing_is_error=True) - for k, v in kw.items() - }) + result = {k: v for k, v in sample.items() if k not in to_be_replaced} + result.update({k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()}) yield result except Exception as exn: if handler(exn): @@ -398,11 +381,7 @@ def _map_dict(data, handler=reraise_exception, **kw): map_dict = pipelinefilter(_map_dict) -def _to_tuple(data, - *args, - handler=reraise_exception, - missing_is_error=True, - none_is_error=None): +def _to_tuple(data, *args, handler=reraise_exception, missing_is_error=True, none_is_error=None): """Convert dict samples to tuples.""" if none_is_error is None: none_is_error = missing_is_error @@ -411,10 +390,7 @@ def _to_tuple(data, for sample in data: try: - result = tuple([ - getfirst(sample, f, missing_is_error=missing_is_error) - for f in args - ]) + result = tuple([getfirst(sample, f, missing_is_error=missing_is_error) for f in args]) if none_is_error and any(x is None for x in result): raise ValueError(f"to_tuple {args} got {sample.keys()}") yield result @@ -487,28 +463,19 @@ rsample = pipelinefilter(_rsample) slice = pipelinefilter(itertools.islice) -def _extract_keys(source, - *patterns, - duplicate_is_error=True, - ignore_missing=False): +def _extract_keys(source, *patterns, duplicate_is_error=True, ignore_missing=False): for sample in source: result = [] for pattern in patterns: - pattern = pattern.split(";") if isinstance(pattern, - str) else pattern - matches = [ - x for x in sample.keys() - if any(fnmatch("." + x, p) for p in pattern) - ] + pattern = pattern.split(";") if isinstance(pattern, str) else pattern + matches = [x for x in sample.keys() if any(fnmatch("." 
+ x, p) for p in pattern)] if len(matches) == 0: if ignore_missing: continue else: - raise ValueError( - f"Cannot find {pattern} in sample keys {sample.keys()}.") + raise ValueError(f"Cannot find {pattern} in sample keys {sample.keys()}.") if len(matches) > 1 and duplicate_is_error: - raise ValueError( - f"Multiple sample keys {sample.keys()} match {pattern}.") + raise ValueError(f"Multiple sample keys {sample.keys()} match {pattern}.") value = sample[matches[0]] result.append(value) yield tuple(result) @@ -517,12 +484,7 @@ def _extract_keys(source, extract_keys = pipelinefilter(_extract_keys) -def _rename_keys(source, - *args, - keep_unselected=False, - must_match=True, - duplicate_is_error=True, - **kw): +def _rename_keys(source, *args, keep_unselected=False, must_match=True, duplicate_is_error=True, **kw): renamings = [(pattern, output) for output, pattern in args] renamings += [(pattern, output) for output, pattern in kw.items()] for sample in source: @@ -542,15 +504,11 @@ def _rename_keys(source, continue if new_name in new_sample: if duplicate_is_error: - raise ValueError( - f"Duplicate value in sample {sample.keys()} after rename." - ) + raise ValueError(f"Duplicate value in sample {sample.keys()} after rename.") continue new_sample[new_name] = value if must_match and not all(matched.values()): - raise ValueError( - f"Not all patterns ({matched}) matched sample keys ({sample.keys()})." - ) + raise ValueError(f"Not all patterns ({matched}) matched sample keys ({sample.keys()}).") yield new_sample @@ -583,18 +541,18 @@ def find_decoder(decoders, path): if fname.startswith("__"): return lambda x: x for pattern, fun in decoders[::-1]: - if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), - pattern): + if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), pattern): return fun return None def _xdecode( - source, - *args, - must_decode=True, - defaults=default_decoders, - **kw, ): + source, + *args, + must_decode=True, + defaults=default_decoders, + **kw, +): decoders = list(defaults) + list(args) decoders += [("*." + k, v) for k, v in kw.items()] for sample in source: @@ -617,18 +575,18 @@ def _xdecode( new_sample[path] = value yield new_sample - xdecode = pipelinefilter(_xdecode) + def _audio_data_filter(source, - frame_shift=10, - max_length=10240, - min_length=10, - token_max_length=200, - token_min_length=1, - min_output_input_ratio=0.0005, - max_output_input_ratio=1): + frame_shift=10, + max_length=10240, + min_length=10, + token_max_length=200, + token_min_length=1, + min_output_input_ratio=0.0005, + max_output_input_ratio=1): """ Filter sample according to feature and label length Inplace operation. 
@@ -655,8 +613,7 @@ def _audio_data_filter(source, assert 'wav' in sample assert 'label' in sample # sample['wav'] is paddle.Tensor, we have 100 frames every second (default) - num_frames = sample['wav'].shape[1] / sample['sample_rate'] * ( - 1000 / frame_shift) + num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (1000 / frame_shift) if num_frames < min_length: continue if num_frames > max_length: @@ -672,15 +629,13 @@ def _audio_data_filter(source, continue yield sample - audio_data_filter = pipelinefilter(_audio_data_filter) - def _audio_tokenize(source, - symbol_table, - bpe_model=None, - non_lang_syms=None, - split_with_space=False): + symbol_table, + bpe_model=None, + non_lang_syms=None, + split_with_space=False): """ Decode text to chars or BPE Inplace operation @@ -738,10 +693,8 @@ def _audio_tokenize(source, sample['label'] = label yield sample - audio_tokenize = pipelinefilter(_audio_tokenize) - def _audio_resample(source, resample_rate=16000): """ Resample data. Inplace operation. @@ -760,22 +713,18 @@ def _audio_resample(source, resample_rate=16000): waveform = sample['wav'] if sample_rate != resample_rate: sample['sample_rate'] = resample_rate - sample['wav'] = paddle.to_tensor( - backends.soundfile_backend.resample( - waveform.numpy(), - src_sr=sample_rate, - target_sr=resample_rate)) + sample['wav'] = paddle.to_tensor(backends.soundfile_backend.resample( + waveform.numpy(), src_sr = sample_rate, target_sr = resample_rate + )) yield sample - audio_resample = pipelinefilter(_audio_resample) - def _audio_compute_fbank(source, - num_mel_bins=80, - frame_length=25, - frame_shift=10, - dither=0.0): + num_mel_bins=80, + frame_length=25, + frame_shift=10, + dither=0.0): """ Extract fbank Args: @@ -797,33 +746,30 @@ def _audio_compute_fbank(source, waveform = sample['wav'] waveform = waveform * (1 << 15) # Only keep fname, feat, label - mat = kaldi.fbank( - waveform, - n_mels=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither, - energy_floor=0.0, - sr=sample_rate) + mat = kaldi.fbank(waveform, + n_mels=num_mel_bins, + frame_length=frame_length, + frame_shift=frame_shift, + dither=dither, + energy_floor=0.0, + sr=sample_rate) yield dict(fname=sample['fname'], label=sample['label'], feat=mat) audio_compute_fbank = pipelinefilter(_audio_compute_fbank) - -def _audio_spec_aug( - source, - max_w=5, - w_inplace=True, - w_mode="PIL", - max_f=30, - num_f_mask=2, - f_inplace=True, - f_replace_with_zero=False, - max_t=40, - num_t_mask=2, - t_inplace=True, - t_replace_with_zero=False, ): +def _audio_spec_aug(source, + max_w=5, + w_inplace=True, + w_mode="PIL", + max_f=30, + num_f_mask=2, + f_inplace=True, + f_replace_with_zero=False, + max_t=40, + num_t_mask=2, + t_inplace=True, + t_replace_with_zero=False,): """ Do spec augmentation Inplace operation @@ -847,23 +793,12 @@ def _audio_spec_aug( for sample in source: x = sample['feat'] x = x.numpy() - x = time_warp(x, max_time_warp=max_w, inplace=w_inplace, mode=w_mode) - x = freq_mask( - x, - F=max_f, - n_mask=num_f_mask, - inplace=f_inplace, - replace_with_zero=f_replace_with_zero) - x = time_mask( - x, - T=max_t, - n_mask=num_t_mask, - inplace=t_inplace, - replace_with_zero=t_replace_with_zero) + x = time_warp(x, max_time_warp=max_w, inplace = w_inplace, mode= w_mode) + x = freq_mask(x, F = max_f, n_mask = num_f_mask, inplace = f_inplace, replace_with_zero = f_replace_with_zero) + x = time_mask(x, T = max_t, n_mask = num_t_mask, inplace = t_inplace, replace_with_zero = t_replace_with_zero) 
sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32) yield sample - audio_spec_aug = pipelinefilter(_audio_spec_aug) @@ -894,10 +829,8 @@ def _sort(source, sort_size=500): for x in buf: yield x - sort = pipelinefilter(_sort) - def _batched(source, batch_size=16): """ Static batch the data by `batch_size` @@ -917,10 +850,8 @@ def _batched(source, batch_size=16): if len(buf) > 0: yield buf - batched = pipelinefilter(_batched) - def dynamic_batched(source, max_frames_in_batch=12000): """ Dynamic batch the data until the total frames in batch reach `max_frames_in_batch` @@ -961,8 +892,8 @@ def _audio_padding(source): """ for sample in source: assert isinstance(sample, list) - feats_length = paddle.to_tensor( - [x['feat'].shape[0] for x in sample], dtype="int64") + feats_length = paddle.to_tensor([x['feat'].shape[0] for x in sample], + dtype="int64") order = paddle.argsort(feats_length, descending=True) feats_lengths = paddle.to_tensor( [sample[i]['feat'].shape[0] for i in order], dtype="int64") @@ -971,20 +902,20 @@ def _audio_padding(source): sorted_labels = [ paddle.to_tensor(sample[i]['label'], dtype="int32") for i in order ] - label_lengths = paddle.to_tensor( - [x.shape[0] for x in sorted_labels], dtype="int64") - padded_feats = pad_sequence( - sorted_feats, batch_first=True, padding_value=0) - padding_labels = pad_sequence( - sorted_labels, batch_first=True, padding_value=-1) - - yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths = paddle.to_tensor([x.shape[0] for x in sorted_labels], + dtype="int64") + padded_feats = pad_sequence(sorted_feats, + batch_first=True, + padding_value=0) + padding_labels = pad_sequence(sorted_labels, + batch_first=True, + padding_value=-1) + + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, label_lengths) - audio_padding = pipelinefilter(_audio_padding) - def _audio_cmvn(source, cmvn_file): global_cmvn = GlobalCMVN(cmvn_file) for batch in source: @@ -992,16 +923,13 @@ def _audio_cmvn(source, cmvn_file): padded_feats = padded_feats.numpy() padded_feats = global_cmvn(padded_feats) padded_feats = paddle.to_tensor(padded_feats, dtype=paddle.float32) - yield (sorted_keys, padded_feats, feats_lengths, padding_labels, - label_lengths) - + yield (sorted_keys, padded_feats, feats_lengths, padding_labels, + label_lengths) audio_cmvn = pipelinefilter(_audio_cmvn) - def _placeholder(source): for data in source: yield data - placeholder = pipelinefilter(_placeholder) diff --git a/paddlespeech/audio/streamdata/gopen.py b/paddlespeech/audio/streamdata/gopen.py index 60a434603..457d048a6 100644 --- a/paddlespeech/audio/streamdata/gopen.py +++ b/paddlespeech/audio/streamdata/gopen.py @@ -3,12 +3,12 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). 
# + + """Open URLs by calling subcommands.""" -import os -import re -import sys -from subprocess import PIPE -from subprocess import Popen + +import os, sys, re +from subprocess import PIPE, Popen from urllib.parse import urlparse # global used for printing additional node information during verbose output @@ -31,13 +31,14 @@ class Pipe: """ def __init__( - self, - *args, - mode=None, - timeout=7200.0, - ignore_errors=False, - ignore_status=[], - **kw, ): + self, + *args, + mode=None, + timeout=7200.0, + ignore_errors=False, + ignore_status=[], + **kw, + ): """Create an IO Pipe.""" self.ignore_errors = ignore_errors self.ignore_status = [0] + ignore_status @@ -74,7 +75,8 @@ class Pipe: if verbose: print( f"pipe exit [{self.status} {os.getpid()}:{self.proc.pid}] {self.args} {info}", - file=sys.stderr, ) + file=sys.stderr, + ) if self.status not in self.ignore_status and not self.ignore_errors: raise Exception(f"{self.args}: exit {self.status} (read) {info}") @@ -112,11 +114,9 @@ class Pipe: self.close() -def set_options(obj, - timeout=None, - ignore_errors=None, - ignore_status=None, - handler=None): +def set_options( + obj, timeout=None, ignore_errors=None, ignore_status=None, handler=None +): """Set options for Pipes. This function can be called on any stream. It will set pipe options only @@ -168,14 +168,16 @@ def gopen_pipe(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141], ) # skipcq: BAN-B604 + ignore_status=[141], + ) # skipcq: BAN-B604 elif mode[0] == "w": return Pipe( cmd, mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141], ) # skipcq: BAN-B604 + ignore_status=[141], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") @@ -194,7 +196,8 @@ def gopen_curl(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": cmd = f"curl -s -L -T - '{url}'" return Pipe( @@ -202,7 +205,8 @@ def gopen_curl(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 26], ) # skipcq: BAN-B604 + ignore_status=[141, 26], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") @@ -222,13 +226,15 @@ def gopen_htgs(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": raise ValueError(f"{mode}: cannot write") else: raise ValueError(f"{mode}: unknown mode") + def gopen_gsutil(url, mode="rb", bufsize=8192): """Open a URL with `curl`. @@ -243,7 +249,8 @@ def gopen_gsutil(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 23], ) # skipcq: BAN-B604 + ignore_status=[141, 23], + ) # skipcq: BAN-B604 elif mode[0] == "w": cmd = f"gsutil cp - '{url}'" return Pipe( @@ -251,11 +258,13 @@ def gopen_gsutil(url, mode="rb", bufsize=8192): mode=mode, shell=True, bufsize=bufsize, - ignore_status=[141, 26], ) # skipcq: BAN-B604 + ignore_status=[141, 26], + ) # skipcq: BAN-B604 else: raise ValueError(f"{mode}: unknown mode") + def gopen_error(url, *args, **kw): """Raise a value error. 
@@ -276,7 +285,8 @@ gopen_schemes = dict( ftps=gopen_curl, scp=gopen_curl, gs=gopen_gsutil, - htgs=gopen_htgs, ) + htgs=gopen_htgs, +) def gopen(url, mode="rb", bufsize=8192, **kw): diff --git a/paddlespeech/audio/streamdata/handlers.py b/paddlespeech/audio/streamdata/handlers.py index 0173e5373..7f3d28b62 100644 --- a/paddlespeech/audio/streamdata/handlers.py +++ b/paddlespeech/audio/streamdata/handlers.py @@ -3,6 +3,7 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + """Pluggable exception handlers. These are functions that take an exception as an argument and then return... @@ -13,8 +14,8 @@ These are functions that take an exception as an argument and then return... They are used as handler= arguments in much of the library. """ -import time -import warnings + +import time, warnings def reraise_exception(exn): diff --git a/paddlespeech/audio/streamdata/mix.py b/paddlespeech/audio/streamdata/mix.py index 37556ed94..7d790f00f 100644 --- a/paddlespeech/audio/streamdata/mix.py +++ b/paddlespeech/audio/streamdata/mix.py @@ -5,12 +5,17 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Classes for mixing samples from multiple sources.""" -import random + +import itertools, os, random, time, sys +from functools import reduce, wraps import numpy as np -from .paddle_utils import IterableDataset +from . import autodecode, utils +from .paddle_utils import PaddleTensor, IterableDataset +from .utils import PipelineStage def round_robin_shortest(*sources): diff --git a/paddlespeech/audio/streamdata/paddle_utils.py b/paddlespeech/audio/streamdata/paddle_utils.py index c2ad8756b..02bc4c841 100644 --- a/paddlespeech/audio/streamdata/paddle_utils.py +++ b/paddlespeech/audio/streamdata/paddle_utils.py @@ -5,11 +5,12 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Mock implementations of paddle interfaces when paddle is not available.""" + try: - from paddle.io import DataLoader - from paddle.io import IterableDataset + from paddle.io import DataLoader, IterableDataset except ModuleNotFoundError: class IterableDataset: @@ -21,3 +22,12 @@ except ModuleNotFoundError: """Empty implementation of DataLoader when paddle is not available.""" pass + +try: + from paddle import Tensor as PaddleTensor +except ModuleNotFoundError: + + class TorchTensor: + """Empty implementation of PaddleTensor when paddle is not available.""" + + pass diff --git a/paddlespeech/audio/streamdata/pipeline.py b/paddlespeech/audio/streamdata/pipeline.py index ff16760ae..7339a762a 100644 --- a/paddlespeech/audio/streamdata/pipeline.py +++ b/paddlespeech/audio/streamdata/pipeline.py @@ -3,12 +3,15 @@ # See the LICENSE file for licensing terms (BSD-style). 
# Modified from https://github.com/webdataset/webdataset #%% -import copy -import sys +import copy, os, random, sys, time +from dataclasses import dataclass from itertools import islice +from typing import List -from .paddle_utils import DataLoader -from .paddle_utils import IterableDataset +import braceexpand, yaml + +from .handlers import reraise_exception +from .paddle_utils import DataLoader, IterableDataset from .utils import PipelineStage @@ -19,7 +22,8 @@ def add_length_method(obj): Combined = type( obj.__class__.__name__ + "_Length", (obj.__class__, IterableDataset), - {"__len__": length}, ) + {"__len__": length}, + ) obj.__class__ = Combined return obj diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py index 54f501052..cfaf9a64b 100644 --- a/paddlespeech/audio/streamdata/shardlists.py +++ b/paddlespeech/audio/streamdata/shardlists.py @@ -4,30 +4,28 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). # + # Modified from https://github.com/webdataset/webdataset + """Train PyTorch models directly from POSIX tar archive. Code works locally or over HTTP connections. """ -import os -import random -import sys -import time -from dataclasses import dataclass -from dataclasses import field + +import os, random, sys, time +from dataclasses import dataclass, field from itertools import islice from typing import List -import braceexpand -import yaml +import braceexpand, yaml from . import utils -from ..utils.log import Logger from .filters import pipelinefilter from .paddle_utils import IterableDataset -logger = Logger(__name__) +from ..utils.log import Logger +logger = Logger(__name__) def expand_urls(urls): if isinstance(urls, str): urllist = urls.split("::") @@ -66,8 +64,7 @@ class SimpleShardList(IterableDataset): def split_by_node(src, group=None): - rank, world_size, worker, num_workers = utils.paddle_worker_info( - group=group) + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) logger.info(f"world_size:{world_size}, rank:{rank}") if world_size > 1: for s in islice(src, rank, None, world_size): @@ -78,11 +75,9 @@ def split_by_node(src, group=None): def single_node_only(src, group=None): - rank, world_size, worker, num_workers = utils.paddle_worker_info( - group=group) + rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group) if world_size > 1: - raise ValueError( - "input pipeline needs to be reconfigured for multinode training") + raise ValueError("input pipeline needs to be reconfigured for multinode training") for s in src: yield s @@ -109,8 +104,7 @@ def resampled_(src, n=sys.maxsize): rng = random.Random(seed) print("# resampled loading", file=sys.stderr) items = list(src) - print( - f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr) + print(f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr) for i in range(n): yield rng.choice(items) @@ -124,9 +118,7 @@ def non_empty(src): yield s count += 1 if count == 0: - raise ValueError( - "pipeline stage received no data at all and this was declared as an error" - ) + raise ValueError("pipeline stage received no data at all and this was declared as an error") @dataclass @@ -146,6 +138,10 @@ def expand(s): return os.path.expanduser(os.path.expandvars(s)) +class MultiShardSample(IterableDataset): + def __init__(self, fname): + """Construct a shardlist from multiple sources using a YAML spec.""" + self.epoch = -1 class 
MultiShardSample(IterableDataset): def __init__(self, fname): """Construct a shardlist from multiple sources using a YAML spec.""" @@ -160,23 +156,20 @@ class MultiShardSample(IterableDataset): else: with open(fname) as stream: spec = yaml.safe_load(stream) - assert set(spec.keys()).issubset( - set("prefix datasets buckets".split())), list(spec.keys()) + assert set(spec.keys()).issubset(set("prefix datasets buckets".split())), list(spec.keys()) prefix = expand(spec.get("prefix", "")) self.sources = [] for ds in spec["datasets"]: - assert set(ds.keys()).issubset( - set("buckets name shards resample choose".split())), list( - ds.keys()) + assert set(ds.keys()).issubset(set("buckets name shards resample choose".split())), list( + ds.keys() + ) buckets = ds.get("buckets", spec.get("buckets", [])) if isinstance(buckets, str): buckets = [buckets] buckets = [expand(s) for s in buckets] if buckets == []: buckets = [""] - assert len( - buckets - ) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented" + assert len(buckets) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented" bucket = buckets[0] name = ds.get("name", "@" + bucket) urls = ds["shards"] @@ -184,19 +177,15 @@ class MultiShardSample(IterableDataset): urls = [urls] # urls = [u for url in urls for u in braceexpand.braceexpand(url)] urls = [ - prefix + os.path.join(bucket, u) - for url in urls for u in braceexpand.braceexpand(expand(url)) + prefix + os.path.join(bucket, u) for url in urls for u in braceexpand.braceexpand(expand(url)) ] resample = ds.get("resample", -1) nsample = ds.get("choose", -1) if nsample > len(urls): - raise ValueError( - f"perepoch {nsample} must be no greater than the number of shards" - ) + raise ValueError(f"perepoch {nsample} must be no greater than the number of shards") if (nsample > 0) and (resample > 0): raise ValueError("specify only one of perepoch or choose") - entry = MSSource( - name=name, urls=urls, perepoch=nsample, resample=resample) + entry = MSSource(name=name, urls=urls, perepoch=nsample, resample=resample) self.sources.append(entry) print(f"# {name} {len(urls)} {nsample}", file=sys.stderr) @@ -214,7 +203,7 @@ class MultiShardSample(IterableDataset): # sample without replacement l = list(source.urls) self.rng.shuffle(l) - l = l[:source.perepoch] + l = l[: source.perepoch] else: l = list(source.urls) result += l @@ -238,11 +227,12 @@ class ResampledShards(IterableDataset): """An iterable dataset yielding a list of urls.""" def __init__( - self, - urls, - nshards=sys.maxsize, - worker_seed=None, - deterministic=False, ): + self, + urls, + nshards=sys.maxsize, + worker_seed=None, + deterministic=False, + ): """Sample shards from the shard list with replacement. 
:param urls: a list of URLs as a Python list or brace notation string @@ -262,8 +252,7 @@ class ResampledShards(IterableDataset): if self.deterministic: seed = utils.make_seed(self.worker_seed(), self.epoch) else: - seed = utils.make_seed(self.worker_seed(), self.epoch, - os.getpid(), time.time_ns(), os.urandom(4)) + seed = utils.make_seed(self.worker_seed(), self.epoch, os.getpid(), time.time_ns(), os.urandom(4)) if os.environ.get("WDS_SHOW_SEED", "0") == "1": print(f"# ResampledShards seed {seed}") self.rng = random.Random(seed) diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py index 79b81c0ce..b1616918c 100644 --- a/paddlespeech/audio/streamdata/tariterators.py +++ b/paddlespeech/audio/streamdata/tariterators.py @@ -3,12 +3,13 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). + # Modified from https://github.com/webdataset/webdataset # Modified from wenet(https://github.com/wenet-e2e/wenet) + """Low level iteration functions for tar archives.""" -import random -import re -import tarfile + +import random, re, tarfile import braceexpand @@ -26,7 +27,6 @@ import numpy as np AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma']) - def base_plus_ext(path): """Split off all file extensions. @@ -47,8 +47,12 @@ def valid_sample(sample): :param sample: sample to be checked """ - return (sample is not None and isinstance(sample, dict) and - len(list(sample.keys())) > 0 and not sample.get("__bad__", False)) + return ( + sample is not None + and isinstance(sample, dict) + and len(list(sample.keys())) > 0 + and not sample.get("__bad__", False) + ) # FIXME: UNUSED @@ -75,16 +79,16 @@ def url_opener(data, handler=reraise_exception, **kw): sample.update(stream=stream) yield sample except Exception as exn: - exn.args = exn.args + (url, ) + exn.args = exn.args + (url,) if handler(exn): continue else: break -def tar_file_iterator(fileobj, - skip_meta=r"__[^/]*__($|/)", - handler=reraise_exception): +def tar_file_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): """Iterate over tar file, yielding filename, content pairs for the given tar stream. 
:param fileobj: byte stream suitable for tarfile @@ -99,8 +103,11 @@ def tar_file_iterator(fileobj, continue if fname is None: continue - if ("/" not in fname and fname.startswith(meta_prefix) and - fname.endswith(meta_suffix)): + if ( + "/" not in fname + and fname.startswith(meta_prefix) + and fname.endswith(meta_suffix) + ): # skipping metadata for now continue if skip_meta is not None and re.match(skip_meta, fname): @@ -111,10 +118,8 @@ def tar_file_iterator(fileobj, assert pos > 0 prefix, postfix = name[:pos], name[pos + 1:] if postfix == 'wav': - waveform, sample_rate = paddlespeech.audio.load( - stream.extractfile(tarinfo), normal=False) - result = dict( - fname=prefix, wav=waveform, sample_rate=sample_rate) + waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False) + result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate) else: txt = stream.extractfile(tarinfo).read().decode('utf8').strip() result = dict(fname=prefix, txt=txt) @@ -123,17 +128,16 @@ def tar_file_iterator(fileobj, stream.members = [] except Exception as exn: if hasattr(exn, "args") and len(exn.args) > 0: - exn.args = (exn.args[0] + " @ " + str(fileobj), ) + exn.args[1:] + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] if handler(exn): continue else: break del stream - -def tar_file_and_group_iterator(fileobj, - skip_meta=r"__[^/]*__($|/)", - handler=reraise_exception): +def tar_file_and_group_iterator( + fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception +): """ Expand a stream of open tar files into a stream of tar file contents. And groups the file with same prefix @@ -163,11 +167,8 @@ def tar_file_and_group_iterator(fileobj, if postfix == 'txt': example['txt'] = file_obj.read().decode('utf8').strip() elif postfix in AUDIO_FORMAT_SETS: - waveform, sample_rate = paddlespeech.audio.load( - file_obj, normal=False) - waveform = paddle.to_tensor( - np.expand_dims(np.array(waveform), 0), - dtype=paddle.float32) + waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False) + waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32) example['wav'] = waveform example['sample_rate'] = sample_rate @@ -175,21 +176,19 @@ def tar_file_and_group_iterator(fileobj, example[postfix] = file_obj.read() except Exception as exn: if hasattr(exn, "args") and len(exn.args) > 0: - exn.args = (exn.args[0] + " @ " + str(fileobj), - ) + exn.args[1:] + exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:] if handler(exn): continue else: break valid = False - # logging.warning('error to parse {}'.format(name)) + # logging.warning('error to parse {}'.format(name)) prev_prefix = prefix if prev_prefix is not None: example['fname'] = prev_prefix yield example stream.close() - def tar_file_expander(data, handler=reraise_exception): """Expand a stream of open tar files into a stream of tar file contents. 
@@ -201,8 +200,9 @@ def tar_file_expander(data, handler=reraise_exception): assert isinstance(source, dict) assert "stream" in source for sample in tar_file_iterator(source["stream"]): - assert (isinstance(sample, dict) and "data" in sample and - "fname" in sample) + assert ( + isinstance(sample, dict) and "data" in sample and "fname" in sample + ) sample["__url__"] = url yield sample except Exception as exn: @@ -213,6 +213,8 @@ def tar_file_expander(data, handler=reraise_exception): break + + def tar_file_and_group_expander(data, handler=reraise_exception): """Expand a stream of open tar files into a stream of tar file contents. @@ -224,8 +226,9 @@ def tar_file_and_group_expander(data, handler=reraise_exception): assert isinstance(source, dict) assert "stream" in source for sample in tar_file_and_group_iterator(source["stream"]): - assert (isinstance(sample, dict) and "wav" in sample and - "txt" in sample and "fname" in sample) + assert ( + isinstance(sample, dict) and "wav" in sample and "txt" in sample and "fname" in sample + ) sample["__url__"] = url yield sample except Exception as exn: @@ -236,11 +239,7 @@ def tar_file_and_group_expander(data, handler=reraise_exception): break -def group_by_keys(data, - keys=base_plus_ext, - lcase=True, - suffixes=None, - handler=None): +def group_by_keys(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None): """Return function over iterator that groups key, value pairs into samples. :param keys: function that splits the key into key and extension (base_plus_ext) @@ -255,8 +254,8 @@ def group_by_keys(data, print( prefix, suffix, - current_sample.keys() - if isinstance(current_sample, dict) else None, ) + current_sample.keys() if isinstance(current_sample, dict) else None, + ) if prefix is None: continue if lcase: diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py index 94dab9052..c7294f2bf 100644 --- a/paddlespeech/audio/streamdata/utils.py +++ b/paddlespeech/audio/streamdata/utils.py @@ -4,23 +4,22 @@ # This file is part of the WebDataset library. # See the LICENSE file for licensing terms (BSD-style). 
# + # Modified from https://github.com/webdataset/webdataset + """Miscellaneous utility functions.""" + import importlib import itertools as itt import os import re import sys -from typing import Any -from typing import Callable -from typing import Iterator -from typing import Union +from typing import Any, Callable, Iterator, Optional, Union from ..utils.log import Logger logger = Logger(__name__) - def make_seed(*args): seed = 0 for arg in args: @@ -38,7 +37,7 @@ def identity(x: Any) -> Any: return x -def safe_eval(s: str, expr: str="{}"): +def safe_eval(s: str, expr: str = "{}"): """Evaluate the given expression more safely.""" if re.sub("[^A-Za-z0-9_]", "", s) != s: raise ValueError(f"safe_eval: illegal characters in: '{s}'") @@ -55,9 +54,9 @@ def lookup_sym(sym: str, modules: list): return None -def repeatedly0(loader: Iterator, - nepochs: int=sys.maxsize, - nbatches: int=sys.maxsize): +def repeatedly0( + loader: Iterator, nepochs: int = sys.maxsize, nbatches: int = sys.maxsize +): """Repeatedly returns batches from a DataLoader.""" for epoch in range(nepochs): for sample in itt.islice(loader, nbatches): @@ -70,11 +69,12 @@ def guess_batchsize(batch: Union[tuple, list]): def repeatedly( - source: Iterator, - nepochs: int=None, - nbatches: int=None, - nsamples: int=None, - batchsize: Callable[..., int]=guess_batchsize, ): + source: Iterator, + nepochs: int = None, + nbatches: int = None, + nsamples: int = None, + batchsize: Callable[..., int] = guess_batchsize, +): """Repeatedly yield samples from an iterator.""" epoch = 0 batch = 0 @@ -93,7 +93,6 @@ def repeatedly( if nepochs is not None and epoch >= nepochs: return - def paddle_worker_info(group=None): """Return node and worker info for PyTorch and some distributed environments.""" rank = 0 @@ -117,7 +116,7 @@ def paddle_worker_info(group=None): else: try: from paddle.io import get_worker_info - worker_info = get_worker_info() + worker_info = paddle.io.get_worker_info() if worker_info is not None: worker = worker_info.id num_workers = worker_info.num_workers @@ -127,7 +126,6 @@ def paddle_worker_info(group=None): return rank, world_size, worker, num_workers - def paddle_worker_seed(group=None): """Compute a distinct, deterministic RNG seed for each worker and node.""" rank, world_size, worker, num_workers = paddle_worker_info(group=group) diff --git a/paddlespeech/audio/streamdata/writer.py b/paddlespeech/audio/streamdata/writer.py index 3928a3ba6..7d4f7703b 100644 --- a/paddlespeech/audio/streamdata/writer.py +++ b/paddlespeech/audio/streamdata/writer.py @@ -5,24 +5,18 @@ # See the LICENSE file for licensing terms (BSD-style). # Modified from https://github.com/webdataset/webdataset # + """Classes and functions for writing tar files and WebDataset files.""" -import io -import json -import pickle -import re -import tarfile -import time -from typing import Any -from typing import Callable -from typing import Optional -from typing import Union + +import io, json, pickle, re, tarfile, time +from typing import Any, Callable, Optional, Union import numpy as np from . import gopen -def imageencoder(image: Any, format: str="PNG"): # skipcq: PYL-W0622 +def imageencoder(image: Any, format: str = "PNG"): # skipcq: PYL-W0622 """Compress an image using PIL and return it as a string. Can handle float or uint8 images. @@ -73,7 +67,6 @@ def bytestr(data: Any): return data.encode("ascii") return str(data).encode("ascii") - def paddle_dumps(data: Any): """Dump data into a bytestring using paddle.dumps. 
@@ -89,7 +82,6 @@ def paddle_dumps(data: Any): paddle.save(data, stream) return stream.getvalue() - def numpy_dumps(data: np.ndarray): """Dump data into a bytestring using numpy npy format. @@ -147,8 +139,9 @@ def add_handlers(d, keys, value): def make_handlers(): """Create a list of handlers for encoding data.""" handlers = {} - add_handlers(handlers, "cls cls2 class count index inx id", - lambda x: str(x).encode("ascii")) + add_handlers( + handlers, "cls cls2 class count index inx id", lambda x: str(x).encode("ascii") + ) add_handlers(handlers, "txt text transcript", lambda x: x.encode("utf-8")) add_handlers(handlers, "html htm", lambda x: x.encode("utf-8")) add_handlers(handlers, "pyd pickle", pickle.dumps) @@ -159,8 +152,7 @@ def make_handlers(): add_handlers(handlers, "json jsn", lambda x: json.dumps(x).encode("utf-8")) add_handlers(handlers, "mp msgpack msg", mp_dumps) add_handlers(handlers, "cbor", cbor_dumps) - add_handlers(handlers, "jpg jpeg img image", - lambda data: imageencoder(data, "jpg")) + add_handlers(handlers, "jpg jpeg img image", lambda data: imageencoder(data, "jpg")) add_handlers(handlers, "png", lambda data: imageencoder(data, "png")) add_handlers(handlers, "pbm", lambda data: imageencoder(data, "pbm")) add_handlers(handlers, "pgm", lambda data: imageencoder(data, "pgm")) @@ -200,8 +192,7 @@ def encode_based_on_extension(sample: dict, handlers: dict): :param handlers: handlers for encoding """ return { - k: encode_based_on_extension1(v, k, handlers) - for k, v in list(sample.items()) + k: encode_based_on_extension1(v, k, handlers) for k, v in list(sample.items()) } @@ -267,14 +258,15 @@ class TarWriter: """ def __init__( - self, - fileobj, - user: str="bigdata", - group: str="bigdata", - mode: int=0o0444, - compress: Optional[bool]=None, - encoder: Union[None, bool, Callable]=True, - keep_meta: bool=False, ): + self, + fileobj, + user: str = "bigdata", + group: str = "bigdata", + mode: int = 0o0444, + compress: Optional[bool] = None, + encoder: Union[None, bool, Callable] = True, + keep_meta: bool = False, + ): """Create a tar writer. :param fileobj: stream to write data to @@ -338,7 +330,8 @@ class TarWriter: continue if not isinstance(v, (bytes, bytearray, memoryview)): raise ValueError( - f"{k} doesn't map to a bytes after encoding ({type(v)})") + f"{k} doesn't map to a bytes after encoding ({type(v)})" + ) key = obj["__key__"] for k in sorted(obj.keys()): if k == "__key__": @@ -356,8 +349,7 @@ class TarWriter: ti.uname = self.user ti.gname = self.group if not isinstance(v, (bytes, bytearray, memoryview)): - raise ValueError( - f"converter didn't yield bytes: {k}, {type(v)}") + raise ValueError(f"converter didn't yield bytes: {k}, {type(v)}") stream = io.BytesIO(v) self.tarstream.addfile(ti, stream) total += ti.size @@ -368,13 +360,14 @@ class ShardWriter: """Like TarWriter but splits into multiple shards.""" def __init__( - self, - pattern: str, - maxcount: int=100000, - maxsize: float=3e9, - post: Optional[Callable]=None, - start_shard: int=0, - **kw, ): + self, + pattern: str, + maxcount: int = 100000, + maxsize: float = 3e9, + post: Optional[Callable] = None, + start_shard: int = 0, + **kw, + ): """Create a ShardWriter. 
:param pattern: output file pattern @@ -407,7 +400,8 @@ class ShardWriter: self.fname, self.count, "%.1f GB" % (self.size / 1e9), - self.total, ) + self.total, + ) self.shard += 1 stream = open(self.fname, "wb") self.tarstream = TarWriter(stream, **self.kw) @@ -419,8 +413,11 @@ class ShardWriter: :param obj: sample to be written """ - if (self.tarstream is None or self.count >= self.maxcount or - self.size >= self.maxsize): + if ( + self.tarstream is None + or self.count >= self.maxcount + or self.size >= self.maxsize + ): self.next_stream() size = self.tarstream.write(obj) self.count += 1 diff --git a/paddlespeech/audio/text/text_featurizer.py b/paddlespeech/audio/text/text_featurizer.py index bcd6df54b..91c4d75c3 100644 --- a/paddlespeech/audio/text/text_featurizer.py +++ b/paddlespeech/audio/text/text_featurizer.py @@ -17,7 +17,6 @@ from typing import Union import sentencepiece as spm -from ..utils.log import Logger from .utility import BLANK from .utility import EOS from .utility import load_dict @@ -25,6 +24,7 @@ from .utility import MASKCTC from .utility import SOS from .utility import SPACE from .utility import UNK +from ..utils.log import Logger logger = Logger(__name__) diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py index 0825caec8..8044dc36f 100644 --- a/paddlespeech/audio/transform/perturb.py +++ b/paddlespeech/audio/transform/perturb.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) -import io -import os - -import h5py import librosa import numpy -import numpy as np import scipy import soundfile +import io +import os +import h5py +import numpy as np class SoundHDF5File(): """Collecting sound files to a HDF5 file @@ -110,7 +109,6 @@ class SoundHDF5File(): def close(self): self.file.close() - class SpeedPerturbation(): """SpeedPerturbation @@ -560,3 +558,4 @@ class RIRConvolve(): [scipy.convolve(x, r, mode="same") for r in rir], axis=-1) else: return scipy.convolve(x, rir, mode="same") + diff --git a/paddlespeech/audio/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py index b2635066f..029e7b8f5 100644 --- a/paddlespeech/audio/transform/spec_augment.py +++ b/paddlespeech/audio/transform/spec_augment.py @@ -14,7 +14,6 @@ # Modified from espnet(https://github.com/espnet/espnet) """Spec Augment module for preprocessing i.e., data augmentation""" import random - import numpy from PIL import Image diff --git a/paddlespeech/audio/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py index 864f3f994..99d50d81e 100644 --- a/paddlespeech/audio/transform/spectrogram.py +++ b/paddlespeech/audio/transform/spectrogram.py @@ -381,6 +381,36 @@ class LogMelSpectrogramKaldi(): mat = np.squeeze(mat.numpy()) return mat +class WavProcess(): + def __init__( + self, + dither=0.1): + """ + Args: + dither (float): Dithering constant + + Returns: + """ + + self.dither = dither + + def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. 
+ + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + waveform = np.expand_dims(x, -1) + return waveform class LogMelSpectrogramKaldi_decay(): def __init__( diff --git a/paddlespeech/audio/transform/transformation.py b/paddlespeech/audio/transform/transformation.py index d24d6437c..e2f66dbf2 100644 --- a/paddlespeech/audio/transform/transformation.py +++ b/paddlespeech/audio/transform/transformation.py @@ -41,6 +41,7 @@ import_alias = dict( utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN", fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram", spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram", + wav_process="paddlespeech.audio.transform.spectrogram:WavProcess", stft="paddlespeech.audio.transform.spectrogram:Stft", istft="paddlespeech.audio.transform.spectrogram:IStft", stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram", diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 7296776f9..f9b4439ec 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -99,9 +99,8 @@ class ASRExecutor(BaseExecutor): '-y', action="store_true", default=False, - help='No additional parameters required. \ - Once set this parameter, it means accepting the request of the program by default, \ - which includes transforming the audio sample rate') + help='No additional parameters required. Once set this parameter, it means accepting the request of the program by default, which includes transforming the audio sample rate' + ) self.parser.add_argument( '--rtf', action="store_true", @@ -341,7 +340,7 @@ class ASRExecutor(BaseExecutor): audio = np.round(audio).astype("int16") return audio - def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False): + def _check(self, audio_file: str, sample_rate: int, force_yes: bool): self.sample_rate = sample_rate if self.sample_rate != 16000 and self.sample_rate != 8000: logger.error( @@ -435,17 +434,8 @@ class ASRExecutor(BaseExecutor): for id_, input_ in task_source.items(): try: - res = self( - audio_file=input_, - model=model, - lang=lang, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - decode_method=decode_method, - force_yes=force_yes, - rtf=rtf, - device=device) + res = self(input_, model, lang, sample_rate, config, ckpt_path, + decode_method, force_yes, rtf, device) task_results[id_] = res except Exception as e: has_exceptions = True diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index b53eed88c..3800c36db 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -191,7 +191,7 @@ class BaseExecutor(ABC): line = line.strip() if not line: continue - k, v = line.split() # space or \t + k, v = line.split() # space or \t job_contents[k] = v return job_contents diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 111987246..48ca1f98d 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -70,14 +70,6 @@ class VectorExecutor(BaseExecutor): type=str, default=None, help="Checkpoint file of model.") - self.parser.add_argument( - '--yes', - '-y', - action="store_true", - default=False, - help='No additional parameters required. 
\ - Once set this parameter, it means accepting the request of the program by default, \ - which includes transforming the audio sample rate') self.parser.add_argument( '--config', type=str, @@ -117,7 +109,6 @@ class VectorExecutor(BaseExecutor): sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path - force_yes = parser_args.yes device = parser_args.device # stage 1: configurate the verbose flag @@ -137,14 +128,8 @@ class VectorExecutor(BaseExecutor): # extract the speaker audio embedding if parser_args.task == "spk": logger.debug("do vector spk task") - res = self( - audio_file=input_, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) + res = self(input_, model, sample_rate, config, ckpt_path, + device) task_result[id_] = res elif parser_args.task == "score": logger.debug("do vector score task") @@ -160,22 +145,10 @@ class VectorExecutor(BaseExecutor): logger.debug( f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}" ) - enroll_embedding = self( - audio_file=enroll_audio, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) - test_embedding = self( - audio_file=test_audio, - model=model, - sample_rate=sample_rate, - config=config, - ckpt_path=ckpt_path, - force_yes=force_yes, - device=device) + enroll_embedding = self(enroll_audio, model, sample_rate, + config, ckpt_path, device) + test_embedding = self(test_audio, model, sample_rate, + config, ckpt_path, device) # get the score res = self.get_embeddings_score(enroll_embedding, @@ -249,7 +222,6 @@ class VectorExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, - force_yes: bool=False, device=paddle.get_device()): """Extract the audio embedding @@ -268,7 +240,7 @@ class VectorExecutor(BaseExecutor): """ # stage 0: check the audio format audio_file = os.path.abspath(audio_file) - if not self._check(audio_file, sample_rate, force_yes): + if not self._check(audio_file, sample_rate): sys.exit(-1) # stage 1: set the paddle runtime host device @@ -446,7 +418,7 @@ class VectorExecutor(BaseExecutor): logger.debug("audio extract the feat success") - def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False): + def _check(self, audio_file: str, sample_rate: int): """Check if the model sample match the audio sample rate Args: @@ -490,34 +462,13 @@ class VectorExecutor(BaseExecutor): logger.debug(f"The sample rate is {audio_sample_rate}") if audio_sample_rate != self.sample_rate: - logger.debug("The sample rate of the input file is not {}.\n \ + logger.error("The sample rate of the input file is not {}.\n \ The program will resample the wav file to {}.\n \ If the result does not meet your expectations,\n \ Please input the 16k 16 bit 1 channel wav file. \ ".format(self.sample_rate, self.sample_rate)) - if force_yes is False: - while (True): - logger.debug( - "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream." 
- ) - content = input("Input(Y/N):") - if content.strip() == "Y" or content.strip( - ) == "y" or content.strip() == "yes" or content.strip( - ) == "Yes": - logger.debug( - "change the sampele rate, channel to 16k and 1 channel" - ) - break - elif content.strip() == "N" or content.strip( - ) == "n" or content.strip() == "no" or content.strip( - ) == "No": - logger.debug("Exit the program") - return False - else: - logger.warning("Not regular input, please input again") - self.change_format = True + sys.exit(-1) else: logger.debug("The audio file format is right") - self.change_format = False return True diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index f049879a3..872d564cd 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -1363,11 +1363,5 @@ g2pw_onnx_models = { 'md5': '7e049a55547da840502cf99e8a64f20e', }, - '1.1': { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip', - 'md5': - 'f8b60501770bff92ed6ce90860a610e6', - }, }, } diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 5fe2e16b9..f6476b9aa 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -114,7 +114,6 @@ if not hasattr(paddle.Tensor, 'new_full'): paddle.Tensor.new_full = new_full paddle.static.Variable.new_full = new_full - def contiguous(xs: paddle.Tensor) -> paddle.Tensor: return xs diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py index 66ea29d08..90b7d8a18 100644 --- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py +++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py @@ -20,8 +20,8 @@ import paddle import soundfile from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.models.ds2 import DeepSpeech2Model from paddlespeech.s2t.training.cli import default_argument_parser from paddlespeech.s2t.utils import mp_tools @@ -38,24 +38,24 @@ class DeepSpeech2Tester_hub(): self.args = args self.config = config self.audio_file = args.audio_file - - self.preprocess_conf = config.preprocess_config - self.preprocess_args = {"train": False} - self.preprocessing = Transformation(self.preprocess_conf) - - self.text_feature = TextFeaturizer( - unit_type=config.unit_type, - vocab=config.vocab_filepath, - spm_model_prefix=config.spm_model_prefix) - paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu') + self.collate_fn_test = SpeechCollator.from_config(config) + self._text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=None) def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg): - decode_batch_size = cfg.decode_batch_size - self.model.decoder.init_decoder( - decode_batch_size, vocab_list, cfg.decoding_method, - cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size, - cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch) - result_transcripts = self.model.decode(audio, audio_len) + result_transcripts = self.model.decode( + audio, + audio_len, + vocab_list, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch) + return 
result_transcripts @mp_tools.rank_zero_only @@ -64,23 +64,16 @@ class DeepSpeech2Tester_hub(): self.model.eval() cfg = self.config audio_file = self.audio_file - - audio, sample_rate = soundfile.read( - self.audio_file, dtype="int16", always_2d=True) - - audio = audio[:, 0] - logger.info(f"audio shape: {audio.shape}") - - # fbank - feat = self.preprocessing(audio, **self.preprocess_args) - logger.info(f"feat shape: {feat.shape}") - - audio_len = paddle.to_tensor(feat.shape[0]) - audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0) - + collate_fn_test = self.collate_fn_test + audio, _ = collate_fn_test.process_utterance( + audio_file=audio_file, transcript=" ") + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + vocab_list = collate_fn_test.vocab_list result_transcripts = self.compute_result_transcripts( - audio, audio_len, self.text_feature.vocab_list, cfg.decode) - + audio, audio_len, vocab_list, cfg.decode) logger.info("result_transcripts: " + result_transcripts[0]) def run_test(self): @@ -116,9 +109,11 @@ class DeepSpeech2Tester_hub(): def setup_model(self): config = self.config.clone() with UpdateConfig(config): - config.input_dim = config.feat_dim - config.output_dim = self.text_feature.vocab_size + config.input_dim = self.collate_fn_test.feature_size + config.output_dim = self.collate_fn_test.vocab_size + model = DeepSpeech2Model.from_config(config) + self.model = model def setup_checkpointer(self): diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index db60083b0..67186081c 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -25,6 +25,8 @@ import paddle from paddle import distributed as dist from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import StreamDataLoader from paddlespeech.s2t.io.dataloader import DataLoaderFactory from paddlespeech.s2t.models.u2 import U2Model from paddlespeech.s2t.training.optimizer import OptimizerFactory @@ -107,8 +109,7 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -135,8 +136,7 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -157,8 +157,7 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -226,18 +225,14 @@ class U2Trainer(Trainer): config = self.config.clone() self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) 
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: decode_batch_size = config.get('decode', dict()).get( 'decode_batch_size', 1) - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) - self.align_loader = DataLoaderFactory.get_dataloader( - 'align', config, self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) logger.info("Setup test/align Dataloader!") def setup_model(self): @@ -250,8 +245,7 @@ class U2Trainer(Trainer): model_conf.output_dim = self.train_loader.vocab_size else: model_conf.input_dim = self.test_loader.feat_dim - model_conf.output_dim = self.test_loader.vocab_size - + model_conf.output_dim = 5538 model = U2Model.from_config(model_conf) if self.parallel: @@ -316,6 +310,11 @@ class U2Tester(U2Trainer): unit_type=self.config.unit_type, vocab=self.config.vocab_filepath, spm_model_prefix=self.config.spm_model_prefix) + + self.text_feature_test = TextFeaturizer( + unit_type=self.config.unit_type, + vocab='/home/zhangtianhao/workspace/PaddleSpeech/examples/aishell/asr1/data/lang_char/vocab.txt', + spm_model_prefix=self.config.spm_model_prefix) self.vocab_list = self.text_feature.vocab_list def id2token(self, texts, texts_len, text_feature): @@ -340,7 +339,7 @@ class U2Tester(U2Trainer): error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer start_time = time.time() - target_transcripts = self.id2token(texts, texts_len, self.text_feature) + target_transcripts = self.id2token(texts, texts_len, self.text_feature_test) result_transcripts, result_tokenids = self.model.decode( audio, audio_len, diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py index 073d74293..cb015c116 100644 --- a/paddlespeech/s2t/exps/u2_kaldi/model.py +++ b/paddlespeech/s2t/exps/u2_kaldi/model.py @@ -105,8 +105,7 @@ class U2Trainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -134,8 +133,7 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -155,8 +153,7 @@ class U2Trainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -168,8 +165,8 @@ class U2Trainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch : {}/{}, ".format( - batch_index + 1, len(self.train_loader)) + msg += "batch : {}/{}, ".format(batch_index + 1, + len(self.train_loader)) msg += "lr: {:>.8f}, ".format(self.lr_scheduler()) msg += "data time: {:>.3f}s, 
".format(dataload_time) self.train_batch(batch_index, batch, msg) @@ -207,24 +204,21 @@ class U2Trainer(Trainer): self.use_streamdata = config.get("use_stream_data", False) if self.train: config = self.config.clone() - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) config = self.config.clone() config['preprocess_config'] = None - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: config = self.config.clone() config['preprocess_config'] = None - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) config = self.config.clone() config['preprocess_config'] = None - self.align_loader = DataLoaderFactory.get_dataloader( - 'align', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) logger.info("Setup test/align Dataloader!") + def setup_model(self): config = self.config diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index d57c49546..603825435 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -121,8 +121,7 @@ class U2STTrainer(Trainer): def valid(self): self.model.eval() if not self.use_streamdata: - logger.info( - f"Valid Total Examples: {len(self.valid_loader.dataset)}") + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") valid_losses = defaultdict(list) num_seen_utts = 1 total_loss = 0.0 @@ -156,8 +155,7 @@ class U2STTrainer(Trainer): msg += "epoch: {}, ".format(self.epoch) msg += "step: {}, ".format(self.iteration) if not self.use_streamdata: - msg += "batch: {}/{}, ".format(i + 1, - len(self.valid_loader)) + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in valid_dump.items()) logger.info(msg) @@ -177,8 +175,7 @@ class U2STTrainer(Trainer): self.before_train() if not self.use_streamdata: - logger.info( - f"Train Total Examples: {len(self.train_loader.dataset)}") + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") while self.epoch < self.config.n_epoch: with Timer("Epoch-Train Time Cost: {}"): self.model.train() @@ -251,16 +248,14 @@ class U2STTrainer(Trainer): config['load_transcript'] = load_transcript self.use_streamdata = config.get("use_stream_data", False) if self.train: - self.train_loader = DataLoaderFactory.get_dataloader( - 'train', config, self.args) - self.valid_loader = DataLoaderFactory.get_dataloader( - 'valid', config, self.args) + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) logger.info("Setup train/valid Dataloader!") else: - self.test_loader = DataLoaderFactory.get_dataloader('test', config, - self.args) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) logger.info("Setup test Dataloader!") + def setup_model(self): config = self.config model_conf = config diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py new file mode 100644 index 000000000..185a92b8d --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py @@ -0,0 +1,13 @@ +# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py new file mode 100644 index 000000000..4d16d9fa9 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Evaluation for U2 model.""" +import cProfile + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + +# TODO(hui zhang): dynamic load + + +def main_sp(config, args): + exp = Tester(config, args) + with exp.eval(): + exp.setup() + exp.run_test() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + # save asr result to + parser.add_argument( + '--dict-path', type=str, default=None, help='dict path.') + parser.add_argument( + "--result_file", type=str, help="path of save the asr result") + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.decode_cfg: + decode_confs = CfgNode(new_allowed=True) + decode_confs.merge_from_file(args.decode_cfg) + config.decode = decode_confs + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats('test.profile') diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/train.py b/paddlespeech/s2t/exps/wav2vec2/bin/train.py new file mode 100644 index 000000000..b977b2a15 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/bin/train.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Trainer for U2 model.""" +import cProfile +import os + +from yacs.config import CfgNode + +from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer +from paddlespeech.s2t.training.cli import default_argument_parser +from paddlespeech.s2t.utils.utility import print_arguments + + +def main_sp(config, args): + exp = Trainer(config, args) + exp.setup() + exp.run() + + +def main(config, args): + main_sp(config, args) + + +if __name__ == "__main__": + parser = default_argument_parser() + args = parser.parse_args() + print_arguments(args, globals()) + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + if args.opts: + config.merge_from_list(args.opts) + config.freeze() + print(config) + if args.dump_config: + with open(args.dump_config, 'w') as f: + print(config, file=f) + + # Setting for profiling + pr = cProfile.Profile() + pr.runcall(main, config, args) + pr.dump_stats(os.path.join(args.output, 'train.profile')) diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py new file mode 100644 index 000000000..587a279b3 --- /dev/null +++ b/paddlespeech/s2t/exps/wav2vec2/model.py @@ -0,0 +1,465 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
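+
+# This module defines Wav2Vec2ASRTrainer, which runs the training loop with
+# gradient accumulation and optional time-domain speech augmentation, and
+# Wav2Vec2ASRTester, which decodes test audio through the text featurizer and
+# reports CER/WER metrics.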
+"""Contains wav2vec2 model.""" +import json +import os +import time +from collections import defaultdict +from collections import OrderedDict +from contextlib import nullcontext +from paddlespeech.s2t.utils import mp_tools + +import jsonlines +import numpy as np +import paddle +from paddle import distributed as dist +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer +from paddlespeech.s2t.io.dataloader import BatchDataLoader +from paddlespeech.s2t.io.dataloader import StreamDataLoader +from paddlespeech.s2t.io.dataloader import DataLoaderFactory +from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR +from paddlespeech.s2t.utils import error_rate + + +from paddlespeech.s2t.training.optimizer import OptimizerFactory +from paddlespeech.s2t.training.reporter import ObsScope +from paddlespeech.s2t.training.reporter import report +from paddlespeech.s2t.training.scheduler import LRSchedulerFactory +from paddlespeech.s2t.training.timer import Timer +from paddlespeech.s2t.training.trainer import Trainer +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.s2t.utils import layer_tools +from paddlespeech.s2t.utils.log import Log + +from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.speech_augmentation import TimeDomainSpecAugment +import pdb + + +logger = Log(__name__).getlog() + +class Wav2Vec2ASRTrainer(Trainer): + def __init__(self, config, args): + super().__init__(config, args) + + def train_batch(self, batch_index, batch, msg): + train_conf = self.config + start = time.time() + + # forward + utt, wav, wavs_lens, target, target_lens = batch + wavs_lens_rate = wavs_lens / wav.shape[1] + target_lens_rate = target_lens / target.shape[1] + wav = wav[:,:,0] + if train_conf.augment: + wav = self.speech_augmentation(wav, wavs_lens_rate) + loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + # print(self.model.wav2vec2.feature_projection.projection.weight) + # print(self.model.wav2vec2.feature_extractor.conv_layers[0].conv.weight) + + # loss div by `batch_size * accum_grad` + loss /= train_conf.accum_grad + losses_np = {'loss': float(loss) * train_conf.accum_grad} + + # loss backward + if (batch_index + 1) % train_conf.accum_grad != 0: + # Disable gradient synchronizations across DDP processes. + # Within this context, gradients will be accumulated on module + # variables, which will later be synchronized. + # When using cpu w/o DDP, model does not have `no_sync` + context = self.model.no_sync if (hasattr(self.model, "no_sync") and + self.parallel) else nullcontext + else: + # Used for single gpu training and DDP gradient synchronization + # processes. 
+ context = nullcontext + with context(): + loss.backward() + layer_tools.print_grads(self.model, print_func=None) + + # optimizer step old + if (batch_index + 1) % train_conf.accum_grad == 0: + self.optimizer.step() + self.optimizer.clear_grad() + self.lr_scheduler.step() + self.iteration += 1 + # optimizer step new + # if (batch_index + 1) % train_conf.accum_grad == 0: + # self.optimizer.step() + # self.optimizer.clear_grad() + # self.iteration += 1 + + iteration_time = time.time() - start + + for k, v in losses_np.items(): + report(k, v) + report("batch_size", self.config.batch_size) + report("accum", train_conf.accum_grad) + report("step_cost", iteration_time) + + if (batch_index + 1) % train_conf.accum_grad == 0: + if dist.get_rank() == 0 and self.visualizer: + losses_np_v = losses_np.copy() + losses_np_v.update({"lr": self.lr_scheduler()}) + for key, val in losses_np_v.items(): + self.visualizer.add_scalar( + tag='train/' + key, value=val, step=self.iteration - 1) + + @paddle.no_grad() + def valid(self): + self.model.eval() + if not self.use_streamdata: + logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}") + valid_losses = defaultdict(list) + num_seen_utts = 1 + total_loss = 0.0 + for i, batch in enumerate(self.valid_loader): + utt, wav, wavs_lens, target, target_lens = batch + wavs_lens_rate = wavs_lens / wav.shape[1] + target_lens_rate = target_lens / target.shape[1] + wav = wav[:,:,0] + loss = self.model(wav, wavs_lens_rate, target, target_lens_rate) + + if paddle.isfinite(loss): + num_utts = batch[1].shape[0] + num_seen_utts += num_utts + total_loss += float(loss) * num_utts + valid_losses['val_loss'].append(float(loss)) + + if (i + 1) % self.config.log_interval == 0: + valid_dump = {k: np.mean(v) for k, v in valid_losses.items()} + valid_dump['val_history_loss'] = total_loss / num_seen_utts + + # logging + msg = f"Valid: Rank: {dist.get_rank()}, " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + if not self.use_streamdata: + msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader)) + msg += ', '.join('{}: {:>.6f}'.format(k, v) + for k, v in valid_dump.items()) + logger.info(msg) + + logger.info('Rank {} Val info val_loss {}'.format( + dist.get_rank(), total_loss / num_seen_utts)) + return total_loss, num_seen_utts + + def do_train(self): + """The training process control by step.""" + # !!!IMPORTANT!!! 
+ # Try to export the model by script, if fails, we should refine + # the code to satisfy the script export requirements + # script_model = paddle.jit.to_static(self.model) + # script_model_path = str(self.checkpoint_dir / 'init') + # paddle.jit.save(script_model, script_model_path) + + self.before_train() + + if not self.use_streamdata: + logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}") + while self.epoch < self.config.n_epoch: + with Timer("Epoch-Train Time Cost: {}"): + self.model.train() + try: + data_start_time = time.time() + for batch_index, batch in enumerate(self.train_loader): + dataload_time = time.time() - data_start_time + msg = "Train:" + observation = OrderedDict() + with ObsScope(observation): + report("Rank", dist.get_rank()) + report("epoch", self.epoch) + report('step', self.iteration) + report("lr", self.lr_scheduler()) + self.train_batch(batch_index, batch, msg) + self.after_train_batch() + report('iter', batch_index + 1) + if not self.use_streamdata: + report('total', len(self.train_loader)) + report('reader_cost', dataload_time) + observation['batch_cost'] = observation[ + 'reader_cost'] + observation['step_cost'] + observation['samples'] = observation['batch_size'] + observation['ips,samples/s'] = observation[ + 'batch_size'] / observation['batch_cost'] + for k, v in observation.items(): + msg += f" {k.split(',')[0]}: " + msg += f"{v:>.8f}" if isinstance(v, + float) else f"{v}" + msg += f" {k.split(',')[1]}" if len( + k.split(',')) == 2 else "" + msg += "," + msg = msg[:-1] # remove the last "," + if (batch_index + 1) % self.config.log_interval == 0: + logger.info(msg) + data_start_time = time.time() + except Exception as e: + logger.error(e) + raise e + with Timer("Eval Time Cost: {}"): + total_loss, num_seen_utts = self.valid() + if dist.get_world_size() > 1: + num_seen_utts = paddle.to_tensor(num_seen_utts) + # the default operator in all_reduce function is sum. 
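+                    # summing counts and losses across ranks so every rank
+                    # computes the same global average validation loss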
+ dist.all_reduce(num_seen_utts) + total_loss = paddle.to_tensor(total_loss) + dist.all_reduce(total_loss) + cv_loss = total_loss / num_seen_utts + cv_loss = float(cv_loss) + else: + cv_loss = total_loss / num_seen_utts + + logger.info( + 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss)) + if self.visualizer: + self.visualizer.add_scalar( + tag='eval/cv_loss', value=cv_loss, step=self.epoch) + self.visualizer.add_scalar( + tag='eval/lr', value=self.lr_scheduler(), step=self.epoch) + + self.save(tag=self.epoch, infos={'val_loss': cv_loss}) + self.new_epoch() + + def setup_dataloader(self): + config = self.config.clone() + self.use_streamdata = config.get("use_stream_data", False) + if self.train: + self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args) + self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args) + logger.info("Setup train/valid Dataloader!") + else: + decode_batch_size = config.get('decode', dict()).get( + 'decode_batch_size', 1) + self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args) + self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args) + logger.info("Setup test/align Dataloader!") + + def setup_model(self): + config = self.config + model_conf = config + + with UpdateConfig(model_conf): + if self.train: + model_conf.input_dim = self.train_loader.feat_dim + model_conf.output_dim = self.train_loader.vocab_size + else: + model_conf.input_dim = self.test_loader.feat_dim + model_conf.output_dim = self.test_loader.vocab_size + + model = Wav2vec2ASR.from_config(model_conf) + + if self.parallel: + model = paddle.DataParallel(model) + + # logger.info(f"{model}") + layer_tools.print_params(model, logger.info) + self.model = model + logger.info("Setup model!") + if model_conf.augment: + self.speech_augmentation = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105]) + + if not self.train: + return + + train_config = config + optim_type = train_config.model_optim + optim_conf = train_config.model_optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + + scheduler_args = { + "learning_rate": optim_conf.lr, + "verbose": False, + "warmup_steps": scheduler_conf.warmup_steps, + "gamma": scheduler_conf.lr_decay, + "d_model": model_conf.dnn_neurons, + } + lr_scheduler = LRSchedulerFactory.from_args(scheduler_type, + scheduler_args) + + def optimizer_args( + config, + parameters, + lr_scheduler=None, ): + train_config = config + optim_type = train_config.model_optim + optim_conf = train_config.model_optim_conf + scheduler_type = train_config.scheduler + scheduler_conf = train_config.scheduler_conf + return { + "grad_clip": train_config.global_grad_clip, + "learning_rate": lr_scheduler + if lr_scheduler else optim_conf.lr, + "epsilon": optim_conf.epsilon, + "rho": optim_conf.rho, + "parameters": parameters, + "epsilon": 1e-9 if optim_type == 'noam' else None, + "beta1": 0.9 if optim_type == 'noam' else None, + "beat2": 0.98 if optim_type == 'noam' else None, + } + + # optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler) + + optimizer = OptimizerFactory.from_args(optim_type, optimzer_args) + + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + logger.info("Setup optimizer/lr_scheduler!") + + + +class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer): + def __init__(self, config, args): + super().__init__(config, args) + print(config) 
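+        # featurizer used by id2token()/compute_metrics() to map decoded
+        # token ids back to text when scoring hypotheses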
+ self.text_featurizer = TextFeaturizer( + unit_type=config.unit_type, vocab=config.vocab_filepath) + self.vocab_list = self.text_featurizer.vocab_list + + def id2token(self, texts, texts_len): + """ ord() id to chr() chr """ + trans = [] + for text, n in zip(texts, texts_len): + n = n.numpy().item() + ids = text[:n] + trans.append( + self.text_featurizer.defeaturize(ids.numpy().tolist())) + return trans + + def compute_metrics(self, + utts, + audio, + audio_len, + texts, + texts_len, + fout=None): + decode_cfg = self.config.decode + errors_sum, len_refs, num_ins = 0.0, 0, 0 + errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors + error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer + + start_time = time.time() + target_transcripts = self.id2token(texts, texts_len) + result_transcripts, result_tokenids = self.model.decode( + audio, + audio_len, + text_feature=self.text_featurizer, + decoding_method=decode_cfg.decoding_method, + beam_size=decode_cfg.beam_size) + decode_time = time.time() - start_time + + for utt, target, result, rec_tids in zip( + utts, target_transcripts, result_transcripts, result_tokenids): + errors, len_ref = errors_func(target, result) + errors_sum += errors + len_refs += len_ref + num_ins += 1 + if fout: + fout.write({ + "utt": utt, + "refs": [target], + "hyps": [result], + "hyps_tokenid": [rec_tids], + }) + logger.info(f"Utt: {utt}") + logger.info(f"Ref: {target}") + logger.info(f"Hyp: {result}") + logger.info("One example error rate [%s] = %f" % ( + decode_cfg.error_rate_type, error_rate_func(target, result))) + + return dict( + errors_sum=errors_sum, + len_refs=len_refs, + num_ins=num_ins, # num examples + error_rate=errors_sum / len_refs, + error_rate_type=decode_cfg.error_rate_type, + num_frames=audio_len.sum().numpy().item(), + decode_time=decode_time) + + @mp_tools.rank_zero_only + @paddle.no_grad() + def test(self): + logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}") + self.model.eval() + + error_rate_type = None + errors_sum, len_refs, num_ins = 0.0, 0, 0 + num_frames = 0.0 + num_time = 0.0 + # Initialized the decoder in model + decode_cfg = self.config.decode + vocab_list = self.vocab_list + decode_batch_size = decode_cfg.decode_batch_size + # self.model.decoder.init_decoder( + # decode_batch_size, vocab_list, decode_cfg.decoding_method, + # decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta, + # decode_cfg.beam_size, decode_cfg.cutoff_prob, + # decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch) + + with jsonlines.open(self.args.result_file, 'w') as fout: + for i, batch in enumerate(self.test_loader): + metrics = self.compute_metrics(*batch, fout=fout) + num_frames += metrics['num_frames'] + num_time += metrics["decode_time"] + errors_sum += metrics['errors_sum'] + len_refs += metrics['len_refs'] + num_ins += metrics['num_ins'] + error_rate_type = metrics['error_rate_type'] + rtf = num_time / (num_frames) + logger.info( + "RTF: %f, Error rate [%s] (%d/?) 
= %f" % + (rtf, error_rate_type, num_ins, errors_sum / len_refs)) + + # logging + msg = "Test: " + msg += "epoch: {}, ".format(self.epoch) + msg += "step: {}, ".format(self.iteration) + msg += "Final error rate [%s] (%d/%d) = %f" % ( + error_rate_type, num_ins, num_ins, errors_sum / len_refs) + logger.info(msg) + + err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err' + err_type_str = "{}".format(error_rate_type) + with open(err_meta_path, 'w') as f: + data = json.dumps({ + "epoch": + self.epoch, + "step": + self.iteration, + "rtf": + rtf, + error_rate_type: + errors_sum / len_refs, + "dataset_hour": (num_frames) / 1000.0 / 3600.0, + "process_hour": + num_time / 1000.0 / 3600.0, + "num_examples": + num_ins, + "err_sum": + errors_sum, + "ref_len": + len_refs, + "decode_method": + self.config.decode.decoding_method, + }) + f.write(data + '\n') + + @paddle.no_grad() + def export(self): + infer_model = DeepSpeech2InferModel.from_pretrained( + self.test_loader, self.config, self.args.checkpoint_path) + infer_model.eval() + static_model = infer_model.export() + logger.info(f"Export code: {static_model.forward.code}") + paddle.jit.save(static_model, self.args.export_path) diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py index 4cc8274f9..735d29da2 100644 --- a/paddlespeech/s2t/io/dataloader.py +++ b/paddlespeech/s2t/io/dataloader.py @@ -22,16 +22,17 @@ import paddle from paddle.io import BatchSampler from paddle.io import DataLoader from paddle.io import DistributedBatchSampler -from yacs.config import CfgNode -import paddlespeech.audio.streamdata as streamdata -from paddlespeech.audio.text.text_featurizer import TextFeaturizer from paddlespeech.s2t.io.batchfy import make_batchset from paddlespeech.s2t.io.converter import CustomConverter from paddlespeech.s2t.io.dataset import TransformDataset from paddlespeech.s2t.io.reader import LoadInputsAndTargets from paddlespeech.s2t.utils.log import Log +import paddlespeech.audio.streamdata as streamdata +from paddlespeech.audio.text.text_featurizer import TextFeaturizer +from yacs.config import CfgNode + __all__ = ["BatchDataLoader", "StreamDataLoader"] logger = Log(__name__).getlog() @@ -60,7 +61,6 @@ def batch_collate(x): """ return x[0] - def read_preprocess_cfg(preprocess_conf_file): augment_conf = dict() preprocess_cfg = CfgNode(new_allowed=True) @@ -82,8 +82,7 @@ def read_preprocess_cfg(preprocess_conf_file): augment_conf['num_t_mask'] = process['n_mask'] augment_conf['t_inplace'] = process['inplace'] augment_conf['t_replace_with_zero'] = process['replace_with_zero'] - return augment_conf - + return augment_conf class StreamDataLoader(): def __init__(self, @@ -96,12 +95,12 @@ class StreamDataLoader(): frame_length=25, frame_shift=10, dither=0.0, - minlen_in: float=0.0, + minlen_in: float=0.0, maxlen_in: float=float('inf'), minlen_out: float=0.0, maxlen_out: float=float('inf'), resample_rate: int=16000, - shuffle_size: int=10000, + shuffle_size: int=10000, sort_size: int=1000, n_iter_processes: int=1, prefetch_factor: int=2, @@ -117,11 +116,11 @@ class StreamDataLoader(): text_featurizer = TextFeaturizer(unit_type, vocab_filepath) symbol_table = text_featurizer.vocab_dict - self.feat_dim = num_mel_bins - self.vocab_size = text_featurizer.vocab_size - + self.feat_dim = num_mel_bins + self.vocab_size = text_featurizer.vocab_size + augment_conf = read_preprocess_cfg(preprocess_conf) - + # The list of shard shardlist = [] with open(manifest_file, "r") as f: @@ -129,68 +128,58 @@ class StreamDataLoader(): 
shardlist.append(line.strip()) world_size = 1 try: - world_size = paddle.distributed.get_world_size() + world_size = paddle.distributed.get_world_size() except Exception as e: logger.warninig(e) - logger.warninig( - "can not get world_size using paddle.distributed.get_world_size(), use world_size=1" - ) - assert len(shardlist) >= world_size, \ - "the length of shard list should >= number of gpus/xpus/..." + logger.warninig("can not get world_size using paddle.distributed.get_world_size(), use world_size=1") + assert(len(shardlist) >= world_size, "the length of shard list should >= number of gpus/xpus/...") - update_n_iter_processes = int( - max(min(len(shardlist) / world_size - 1, self.n_iter_processes), 0)) + update_n_iter_processes = int(max(min(len(shardlist)/world_size - 1, self.n_iter_processes), 0)) logger.info(f"update_n_iter_processes {update_n_iter_processes}") if update_n_iter_processes != self.n_iter_processes: - self.n_iter_processes = update_n_iter_processes + self.n_iter_processes = update_n_iter_processes logger.info(f"change nun_workers to {self.n_iter_processes}") if self.dist_sampler: base_dataset = streamdata.DataPipeline( - streamdata.SimpleShardList(shardlist), streamdata.split_by_node - if train_mode else streamdata.placeholder(), + streamdata.SimpleShardList(shardlist), + streamdata.split_by_node if train_mode else streamdata.placeholder(), streamdata.split_by_worker, - streamdata.tarfile_to_samples(streamdata.reraise_exception)) + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) else: base_dataset = streamdata.DataPipeline( streamdata.SimpleShardList(shardlist), streamdata.split_by_worker, - streamdata.tarfile_to_samples(streamdata.reraise_exception)) + streamdata.tarfile_to_samples(streamdata.reraise_exception) + ) self.dataset = base_dataset.append_list( streamdata.audio_tokenize(symbol_table), - streamdata.audio_data_filter( - frame_shift=frame_shift, - max_length=maxlen_in, - min_length=minlen_in, - token_max_length=maxlen_out, - token_min_length=minlen_out), + streamdata.audio_data_filter(frame_shift=frame_shift, max_length=maxlen_in, min_length=minlen_in, token_max_length=maxlen_out, token_min_length=minlen_out), streamdata.audio_resample(resample_rate=resample_rate), - streamdata.audio_compute_fbank( - num_mel_bins=num_mel_bins, - frame_length=frame_length, - frame_shift=frame_shift, - dither=dither), - streamdata.audio_spec_aug(**augment_conf) - if train_mode else streamdata.placeholder( - ), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80) + streamdata.audio_compute_fbank(num_mel_bins=num_mel_bins, frame_length=frame_length, frame_shift=frame_shift, dither=dither), + streamdata.audio_spec_aug(**augment_conf) if train_mode else streamdata.placeholder(), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80) streamdata.shuffle(shuffle_size), streamdata.sort(sort_size=sort_size), streamdata.batched(batch_size), streamdata.audio_padding(), - streamdata.audio_cmvn(cmvn_file)) + streamdata.audio_cmvn(cmvn_file) + ) if paddle.__version__ >= '2.3.2': self.loader = streamdata.WebLoader( - self.dataset, - num_workers=self.n_iter_processes, - prefetch_factor=self.prefetch_factor, - batch_size=None) + self.dataset, + num_workers=self.n_iter_processes, + prefetch_factor = self.prefetch_factor, + batch_size=None + ) else: self.loader = streamdata.WebLoader( - self.dataset, - num_workers=self.n_iter_processes, - batch_size=None) + self.dataset, + num_workers=self.n_iter_processes, + batch_size=None + ) def __iter__(self): return 
self.loader.__iter__() @@ -199,9 +188,7 @@ class StreamDataLoader(): return self.__iter__() def __len__(self): - logger.info( - "Stream dataloader does not support calculate the length of the dataset" - ) + logger.info("Stream dataloader does not support calculate the length of the dataset") return -1 @@ -360,7 +347,7 @@ class DataLoaderFactory(): config['train_mode'] = True elif mode == 'valid': config['manifest'] = config.dev_manifest - config['train_mode'] = False + config['train_mode'] = False elif model == 'test' or mode == 'align': config['manifest'] = config.test_manifest config['train_mode'] = False @@ -371,31 +358,30 @@ class DataLoaderFactory(): config['maxlen_out'] = float('inf') config['dist_sampler'] = False else: - raise KeyError( - "not valid mode type!!, please input one of 'train, valid, test, align'" - ) + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") return StreamDataLoader( - manifest_file=config.manifest, - train_mode=config.train_mode, - unit_type=config.unit_type, - preprocess_conf=config.preprocess_config, - batch_size=config.batch_size, - num_mel_bins=config.feat_dim, - frame_length=config.window_ms, - frame_shift=config.stride_ms, - dither=config.dither, - minlen_in=config.minlen_in, - maxlen_in=config.maxlen_in, - minlen_out=config.minlen_out, - maxlen_out=config.maxlen_out, - resample_rate=config.resample_rate, - shuffle_size=config.shuffle_size, - sort_size=config.sort_size, - n_iter_processes=config.num_workers, - prefetch_factor=config.prefetch_factor, - dist_sampler=config.dist_sampler, - cmvn_file=config.cmvn_file, - vocab_filepath=config.vocab_filepath, ) + manifest_file=config.manifest, + train_mode=config.train_mode, + unit_type=config.unit_type, + preprocess_conf=config.preprocess_config, + batch_size=config.batch_size, + num_mel_bins=config.feat_dim, + frame_length=config.window_ms, + frame_shift=config.stride_ms, + dither=config.dither, + minlen_in=config.minlen_in, + maxlen_in=config.maxlen_in, + minlen_out=config.minlen_out, + maxlen_out=config.maxlen_out, + resample_rate=config.resample_rate, + shuffle_size=config.shuffle_size, + sort_size=config.sort_size, + n_iter_processes=config.num_workers, + prefetch_factor=config.prefetch_factor, + dist_sampler=config.dist_sampler, + cmvn_file=config.cmvn_file, + vocab_filepath=config.vocab_filepath, + ) else: if mode == 'train': config['manifest'] = config.train_manifest @@ -425,7 +411,7 @@ class DataLoaderFactory(): config['train_mode'] = False config['sortagrad'] = False config['batch_size'] = config.get('decode', dict()).get( - 'decode_batch_size', 1) + 'decode_batch_size', 1) config['maxlen_in'] = float('inf') config['maxlen_out'] = float('inf') config['minibatches'] = 0 @@ -441,10 +427,8 @@ class DataLoaderFactory(): config['dist_sampler'] = False config['shortest_first'] = False else: - raise KeyError( - "not valid mode type!!, please input one of 'train, valid, test, align'" - ) - + raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'") + return BatchDataLoader( json_file=config.manifest, train_mode=config.train_mode, @@ -466,3 +450,4 @@ class DataLoaderFactory(): num_encs=config.num_encs, dist_sampler=config.dist_sampler, shortest_first=config.shortest_first) + diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py index 5e018befb..44e452bb0 100644 --- a/paddlespeech/s2t/io/reader.py +++ b/paddlespeech/s2t/io/reader.py @@ -120,6 +120,7 @@ class LoadInputsAndTargets(): x = self._get_from_loader( 
filepath=inp["feat"], filetype=inp.get("filetype", "mat")) + x_feats_dict.setdefault(inp["name"], []).append(x) if self.load_output: @@ -236,6 +237,7 @@ class LoadInputsAndTargets(): :return: :rtype: np.ndarray """ + if filetype == "hdf5": # e.g. # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py index b7ee80a7d..4557af86f 100644 --- a/paddlespeech/s2t/models/ds2/deepspeech2.py +++ b/paddlespeech/s2t/models/ds2/deepspeech2.py @@ -271,7 +271,7 @@ class DeepSpeech2Model(nn.Layer): enc_n_units=self.encoder.output_size, blank_id=blank_id, dropout_rate=0.0, - reduction=True, # sum + reduction_type="sum", # sum batch_average=True, # sum / batch_size grad_norm_type=ctc_grad_norm_type) diff --git a/paddlespeech/s2t/models/test.py b/paddlespeech/s2t/models/test.py new file mode 100644 index 000000000..488c386e1 --- /dev/null +++ b/paddlespeech/s2t/models/test.py @@ -0,0 +1,20 @@ +import paddle +import paddle.nn as nn + +class Model(nn.Layer): + def __init__(self): + super().__init__() + self.linear = nn.Linear(1024,1024) + + def forward(self, x): + return self.linear(x) + +model = Model() +x = paddle.uniform([100,1024], dtype='float32') +out = model(x) +loss = paddle.mean(out) +loss.backward() + +clip = nn.ClipGradByGlobalNorm(clip_norm=1.0) +optim = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=model.parameters(), grad_clip=clip) +optim.step() \ No newline at end of file diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 813e1e529..b6a4eb7fa 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. 
@@ -864,7 +864,7 @@ class U2Model(U2DecodeModel): enc_n_units=encoder.output_size(), blank_id=0, dropout_rate=dropout_rate, - reduction=True, # sum + reduction_type="sum", # sum batch_average=True, # sum / batch_size grad_norm_type=grad_norm_type) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index e8b61bc0d..81ae43184 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -18,6 +18,7 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni """ import time from typing import Dict +from typing import List from typing import Optional from typing import Tuple @@ -25,8 +26,6 @@ import paddle from paddle import jit from paddle import nn -from paddlespeech.audio.utils.tensor_utils import add_sos_eos -from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.frontend.utility import IGNORE_ID from paddlespeech.s2t.frontend.utility import load_cmvn from paddlespeech.s2t.modules.cmvn import GlobalCMVN @@ -39,6 +38,8 @@ from paddlespeech.s2t.modules.mask import subsequent_mask from paddlespeech.s2t.utils import checkpoint from paddlespeech.s2t.utils import layer_tools from paddlespeech.s2t.utils.log import Log +from paddlespeech.audio.utils.tensor_utils import add_sos_eos +from paddlespeech.audio.utils.tensor_utils import th_accuracy from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["U2STModel", "U2STInferModel"] @@ -400,8 +401,8 @@ class U2STBaseModel(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), - cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Export interface for c++ call, give input chunk xs, and return output from time 0 to current chunk. @@ -434,8 +435,8 @@ class U2STBaseModel(nn.Layer): paddle.Tensor: new conformer cnn cache required for next chunk, with same shape as the original cnn_cache. """ - return self.encoder.forward_chunk(xs, offset, required_cache_size, - att_cache, cnn_cache) + return self.encoder.forward_chunk( + xs, offset, required_cache_size, att_cache, cnn_cache) # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: @@ -611,7 +612,7 @@ class U2STModel(U2STBaseModel): enc_n_units=encoder.output_size(), blank_id=0, dropout_rate=dropout_rate, - reduction=True, # sum + reduction_type='sum', # sum batch_average=True, # sum / batch_size grad_norm_type=grad_norm_type) diff --git a/paddlespeech/s2t/models/wav2vec2/__init__.py b/paddlespeech/s2t/models/wav2vec2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/activations.py b/paddlespeech/s2t/models/wav2vec2/activations.py new file mode 100644 index 000000000..0158e8cb0 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/activations.py @@ -0,0 +1,175 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from packaging import version +from paddle import Tensor, nn + + +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + + +class NewGELUActivation(nn.Layer): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Layer): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Layer): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * paddle.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Layer): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). 
See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return paddle.clip(gelu(x), self.min, self.max) + + +class SiLUActivation(nn.Layer): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + + def __init__(self): + super().__init__() + self.act = nn.functional.silu + + def _silu_python(self, input: Tensor) -> Tensor: + return input * paddle.sigmoid(input) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class MishActivation(nn.Layer): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def __init__(self): + super().__init__() + self.act = nn.functional.mish + + def _mish_python(self, input: Tensor) -> Tensor: + return input * paddle.tanh(nn.functional.softplus(input)) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class LinearActivation(nn.Layer): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +ACT2FN = { + "gelu": GELUActivation(), + "gelu_10": ClippedGELUActivation(-10, 10), + "gelu_fast": FastGELUActivation(), + "gelu_new": NewGELUActivation(), + "gelu_python": GELUActivation(use_gelu_python=True), + "linear": LinearActivation(), + "mish": MishActivation(), + "quick_gelu": QuickGELUActivation(), + "relu": nn.ReLU(), + "sigmoid": nn.Sigmoid(), + "silu": SiLUActivation(), + "swish": SiLUActivation(), + "tanh": nn.Tanh(), +} + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py new file mode 100644 index 000000000..a5b509b66 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py @@ -0,0 +1,1129 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
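A minimal usage sketch for the activation registry added in activations.py above (illustrative only, not part of the patch; the tensor shape is arbitrary):

    import paddle
    from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN, get_activation

    act = get_activation("gelu_new")   # same instance as ACT2FN["gelu_new"]
    x = paddle.randn([2, 8])           # any float tensor
    y = act(x)                         # element-wise GELU approximation, same shape as x
    assert y.shape == x.shape
    # get_activation("unknown") raises KeyError listing the available ACT2FN names.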
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple +from collections import OrderedDict + +from dataclasses import fields +import paddle + + +class ModelOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a + tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular + python dictionary. + + + + You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple + before. + + + """ + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + if not len(class_fields): + raise ValueError(f"{self.__class__.__name__} has no fields.") + if not all(field.default is None for field in class_fields[1:]): + raise ValueError(f"{self.__class__.__name__} should not have more than one required field.") + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not paddle.is_tensor(first_field): + if isinstance(first_field, dict): + iterator = first_field.items() + first_field_iterator = True + else: + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple: + """ + Convert 
self to a tuple containing all the attributes/keys that are not `None`.
+        """
+        return tuple(self[k] for k in self.keys())
+
+
+@dataclass
+class BaseModelOutput(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states and attentions.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    last_hidden_state: paddle.Tensor = None
+    hidden_states: Optional[Tuple[paddle.Tensor]] = None
+    attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithNoAttention(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+    """
+
+    last_hidden_state: paddle.Tensor = None
+    hidden_states: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPooling(ModelOutput):
+    """
+    Base class for model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
+            Last layer hidden-state of the first token of the sequence (classification token) after further processing
+            through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
+            the classification token after processing through a linear layer and a tanh activation function. The linear
+            layer weights are trained from the next sentence prediction (classification) objective during pretraining.
+        hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+ + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. 
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + """ + + last_hidden_state: paddle.Tensor = None + pooler_output: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. 
+ + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key, + value states of the self-attention and the cross-attention layers if model is used in encoder-decoder + setting. Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Masked language modeling (MLM) loss. 
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. 
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class NextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided): + Next sequence prediction (classification) loss. + logits (`paddle.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. 
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). 
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). 
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ """ + + loss: Optional[paddle.Tensor] = None + start_logits: paddle.Tensor = None + end_logits: paddle.Tensor = None + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None + decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + decoder_attentions: Optional[Tuple[paddle.Tensor]] = None + cross_attentions: Optional[Tuple[paddle.Tensor]] = None + encoder_last_hidden_state: Optional[paddle.Tensor] = None + encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None + encoder_attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class SemanticSegmenterOutput(ModelOutput): + """ + Base class for outputs of semantic segmentation models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ImageClassifierOutput(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class ImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also + called feature maps) of the model at the output of each stage. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class DepthEstimatorOutput(ModelOutput): + """ + Base class for outputs of depth estimation models. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`paddle.Tensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + predicted_depth: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class Wav2Vec2BaseModelOutput(ModelOutput): + """ + Base class for models that have been trained with the Wav2Vec2 loss objective. + + Args: + last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`paddle.Tensor` of shape `(batch_size, sequence_length, conv_dim[-1])`): + Sequence of extracted feature vectors of the last convolutional layer of the model. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: paddle.Tensor = None + extract_features: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + + +@dataclass +class XVectorOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForXVector`]. + + Args: + loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. + logits (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`): + Classification hidden states before AMSoftmax. + embeddings (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`): + Utterance embeddings used for vector similarity-based retrieval. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + embeddings: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py new file mode 100755 index 000000000..5accff120 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py @@ -0,0 +1,1259 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch Wav2Vec2 model.""" + +import math +import warnings +import paddle +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +from paddle import nn + +from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN +from paddlespeech.s2t.models.wav2vec2.modeling_outputs import ( + BaseModelOutput, + Wav2Vec2BaseModelOutput, + ModelOutput +) + + +from paddlespeech.s2t.utils.log import Log +logger = Log(__name__).getlog() + + +@dataclass +class Wav2Vec2ForPreTrainingOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions. + + Args: + loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official + paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss. + projected_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked + projected quantized states. + projected_quantized_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`): + Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive + target vectors for contrastive loss. + hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`): + The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) . + """ + + loss: Optional[paddle.Tensor] = None + projected_states: paddle.Tensor = None + projected_quantized_states: paddle.Tensor = None + codevector_perplexity: paddle.Tensor = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + contrastive_loss: Optional[paddle.Tensor] = None + diversity_loss: Optional[paddle.Tensor] = None + + +def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[paddle.Tensor] = None, + min_masks: int = 0, +) -> np.ndarray: + """ + Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for + ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on + CPU as part of the preprocessing during training. 
+
+    Args:
+        shape: The shape for which to compute masks. This should be a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                   independently generated mask spans of length `mask_length` is computed by
+                   `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                   actual percentage will be smaller.
+        mask_length: size of the mask
+        min_masks: minimum number of masked spans
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}"
+        )
+
+    # epsilon is used for probabilistic rounding
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    input_lengths = (
+        attention_mask.sum(-1).detach().tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
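+        # For example: if `max_num_masked_span` is 3 but only 2 spans were sampled
+        # for this utterance, one extra copy of the dummy index is appended below so
+        # that every row of `spec_aug_mask_idxs` has the same number of start indices
+        # before the spans are expanded to length `mask_length`.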
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller than
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape((batch_size, max_num_masked_span * mask_length))
+
+    # add offset to the starting indices so that indices now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        (batch_size, max_num_masked_span * mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+def _sample_negative_indices(
+    features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
+):
+    """
+    Sample `num_negatives` vectors from feature vectors.
+    """
+    batch_size, sequence_length = features_shape
+
+    # generate indices of the positive vectors themselves, repeat them `num_negatives` times
+    sequence_length_range = np.arange(sequence_length)
+
+    # get `num_negatives` random vector indices from the same utterance
+    sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
+
+    mask_time_indices = (
+        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
+    )
+
+    for batch_idx in range(batch_size):
+        high = mask_time_indices[batch_idx].sum() - 1
+        mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
+
+        feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
+        sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
+        # avoid sampling the same positive vector, but keep the distribution uniform
+        sampled_indices[sampled_indices >= feature_indices] += 1
+
+        # remap to actual indices
+        sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
+
+        # correct for batch size
+        sampled_negative_indices[batch_idx] += batch_idx * sequence_length
+
+    return sampled_negative_indices
+
+
+class Wav2Vec2NoLayerNormConvLayer(nn.Layer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1D(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias_attr=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
return hidden_states + + +class Wav2Vec2LayerNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.layer_norm = nn.LayerNorm(self.out_conv_dim) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = hidden_states.transpose([0, 2, 1]) + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states.transpose([0, 2, 1]) + + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2GroupNormConvLayer(nn.Layer): + def __init__(self, config, layer_id=0): + super().__init__() + self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1 + self.out_conv_dim = config.conv_dim[layer_id] + + self.conv = nn.Conv1D( + self.in_conv_dim, + self.out_conv_dim, + kernel_size=config.conv_kernel[layer_id], + stride=config.conv_stride[layer_id], + bias_attr=config.conv_bias, + ) + self.activation = ACT2FN[config.feat_extract_activation] + + self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class Wav2Vec2PositionalConvEmbedding(nn.Layer): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1D( + config.hidden_size, + config.hidden_size, + kernel_size=config.num_conv_pos_embeddings, + padding=config.num_conv_pos_embeddings // 2, + groups=config.num_conv_pos_embedding_groups, + ) + + self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2) + + self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings) + self.activation = ACT2FN[config.feat_extract_activation] + + def forward(self, hidden_states): + hidden_states = hidden_states.transpose([0, 2, 1]) + + hidden_states = self.conv(hidden_states) + hidden_states = self.padding(hidden_states) + hidden_states = self.activation(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + return hidden_states + + +class Wav2Vec2SamePadLayer(nn.Layer): + def __init__(self, num_conv_pos_embeddings): + super().__init__() + self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0 + + def forward(self, hidden_states): + if self.num_pad_remove > 0: + hidden_states = hidden_states[:, :, : -self.num_pad_remove] + return hidden_states + + +class Wav2Vec2FeatureEncoder(nn.Layer): + """Construct the features from raw audio waveform""" + + def __init__(self, config): + super().__init__() + + if config.feat_extract_norm == "group": + conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [ + Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1) + ] + elif config.feat_extract_norm == "layer": + conv_layers = [ + Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers) + ] + else: + raise ValueError( + f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']" + ) + self.conv_layers = nn.LayerList(conv_layers) + self.gradient_checkpointing = 
False + self._requires_grad = True + + def _freeze_parameters(self): + for param in self.parameters(): + param.requires_grad = False + self._requires_grad = False + + def forward(self, input_values): + hidden_states = input_values[:, None] + + # make sure hidden_states require grad for gradient_checkpointing + #if self._requires_grad and self.training: + # hidden_states.requires_grad = True + + for conv_layer in self.conv_layers: + hidden_states = conv_layer(hidden_states) + + return hidden_states + + +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + +class Wav2Vec2FeatureProjection(nn.Layer): + def __init__(self, config): + super().__init__() + self.layer_norm = nn.LayerNorm(config.conv_dim[-1], epsilon=config.layer_norm_eps) + self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size) + self.dropout = nn.Dropout(config.feat_proj_dropout) + + def forward(self, hidden_states): + # non-projected hidden states are needed for quantization + norm_hidden_states = self.layer_norm(hidden_states) + hidden_states = self.projection(norm_hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states, norm_hidden_states + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2 +class Wav2Vec2Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." 
+ ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias) + + def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int): + return paddle.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)).transpose([0, 2, 1, 3]) + + def forward( + self, + hidden_states: paddle.Tensor, + key_value_states: Optional[paddle.Tensor] = None, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + layer_head_mask: Optional[paddle.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.shape + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = paddle.concat([past_key_value[0], key_states], axis=2) + value_states = paddle.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(paddle.Tensor, paddle.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention
+                # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+                # if encoder bi-directional self-attention `past_key_value` is always `None`
+                past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
+        key_states = key_states.reshape(proj_shape)
+        value_states = value_states.reshape(proj_shape)
+
+        src_len = key_states.shape[1]
+        attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1]))
+
+        if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]:
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.shape}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.shape != [bsz, 1, tgt_len, src_len]:
+                raise ValueError(
+                    f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}"
+                )
+            attn_weights = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.shape != [self.num_heads]:
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {[self.num_heads]}, but is"
+                    f" {layer_head_mask.shape}"
+                )
+            attn_weights = layer_head_mask.reshape((1, -1, 1, 1)) * attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+            attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+            attn_weights = attn_weights_reshaped.reshape((bsz * self.num_heads, tgt_len, src_len))
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = paddle.bmm(attn_probs, value_states)
+
+        if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]:
+            raise ValueError(
+                f"`attn_output` should be of size {[bsz * self.num_heads, tgt_len, self.head_dim]}, but is"
+                f" {attn_output.shape}"
+            )
+
+        attn_output = attn_output.reshape((bsz, self.num_heads, tgt_len, self.head_dim))
+        attn_output = attn_output.transpose([0, 2, 1, 3])
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
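+        # At this point `attn_output` has shape (bsz, tgt_len, num_heads, head_dim);
+        # the reshape below folds the head dimensions back into `embed_dim`.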
+ attn_output = attn_output.reshape((bsz, tgt_len, self.embed_dim)) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class Wav2Vec2FeedForward(nn.Layer): + def __init__(self, config): + super().__init__() + self.intermediate_dropout = nn.Dropout(config.activation_dropout) + + self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.output_dropout = nn.Dropout(config.hidden_dropout) + + def forward(self, hidden_states): + hidden_states = self.intermediate_dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.intermediate_dropout(hidden_states) + + hidden_states = self.output_dense(hidden_states) + hidden_states = self.output_dropout(hidden_states) + return hidden_states + + +class Wav2Vec2EncoderLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + + hidden_states = self.layer_norm(hidden_states) + hidden_states = hidden_states + self.feed_forward(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2EncoderLayerStableLayerNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.attention = Wav2Vec2Attention( + embed_dim=config.hidden_size, + num_heads=config.num_attention_heads, + dropout=config.attention_dropout, + is_decoder=False, + ) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.feed_forward = Wav2Vec2FeedForward(config) + self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + + def forward(self, hidden_states, attention_mask=None, output_attentions=False): + attn_residual = hidden_states + hidden_states = self.layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.attention( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = self.dropout(hidden_states) + hidden_states = attn_residual + hidden_states + hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states)) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Wav2Vec2Encoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + 
self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.LayerList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens output 0 + expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2]) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * np.iinfo(np.float32).min #torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.layer_norm(hidden_states) + hidden_states = self.dropout(hidden_states) + + #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer:# or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2EncoderStableLayerNorm(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config) + self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout) + self.layers = nn.LayerList( + [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if attention_mask is not None: + # make sure padded tokens are not attended to + expand_attention_mask = 
attention_mask.unsqueeze(-1).repeat_interleave(hidden_states.shape[2], axis=2) + hidden_states[~expand_attention_mask] = 0 + + # extend attention_mask + attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype) + attention_mask = attention_mask * np.iinfo(np.float32).min # torch.finfo(hidden_states.dtype).min + attention_mask = attention_mask.expand( + attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] + ) + + position_embeddings = self.pos_conv_embed(hidden_states) + hidden_states = hidden_states + position_embeddings + hidden_states = self.dropout(hidden_states) + + #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() + + for layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = np.random.uniform(0, 1) + + skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False + if not skip_the_layer:# or deepspeed_zero3_is_enabled: + # under deepspeed zero3 all gpus must run in sync + # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication + if self.gradient_checkpointing and self.training: + # create gradient checkpointing function + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + else: + layer_outputs = layer( + hidden_states, attention_mask=attention_mask, output_attentions=output_attentions + ) + hidden_states = layer_outputs[0] + + if skip_the_layer: + layer_outputs = (None, None) + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class Wav2Vec2GumbelVectorQuantizer(nn.Layer): + """ + Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH + GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information. 
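+
+    For example, with the defaults in `Wav2Vec2ConfigPure` (2 codevector groups of
+    320 codewords each and `codevector_dim=256`), each frame is projected to
+    2 * 320 logits, one 128-dimensional codeword is selected per group (via
+    Gumbel-softmax during training, argmax otherwise), and the two codewords are
+    concatenated into a single 256-dimensional quantized vector.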
+ """ + + def __init__(self, config): + super().__init__() + self.num_groups = config.num_codevector_groups + self.num_vars = config.num_codevectors_per_group + + if config.codevector_dim % self.num_groups != 0: + raise ValueError( + f"`config.codevector_dim {config.codevector_dim} must be divisible " + f"by `config.num_codevector_groups` {self.num_groups} for concatenation" + ) + + # storage for codebook variables (codewords) + self.codevectors = paddle.static.create_parameter( + shape=[1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups], dtype='float32' + ) + self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars) + + # can be decayed for training + self.temperature = 2 + + @staticmethod + def _compute_perplexity(probs, mask=None): + if mask is not None: + mask_extended = mask.flatten()[:, None, None].expand(probs.shape) + probs = paddle.where(mask_extended, probs, paddle.zeros_like(probs)) + marginal_probs = probs.sum(dim=0) / mask.sum() + else: + marginal_probs = probs.mean(dim=0) + + perplexity = paddle.exp(-paddle.sum(marginal_probs * paddle.log(marginal_probs + 1e-7), dim=-1)).sum() + return perplexity + + def forward(self, hidden_states, mask_time_indices=None): + batch_size, sequence_length, hidden_size = hidden_states.shape + + # project to codevector dim + hidden_states = self.weight_proj(hidden_states) + hidden_states = hidden_states.reshape((batch_size * sequence_length * self.num_groups, -1)) + + if self.training: + # sample code vector probs via gumbel in differentiateable way + codevector_probs = nn.functional.gumbel_softmax( + hidden_states.float(), tau=self.temperature, hard=True + ).type_as(hidden_states) + + # compute perplexity + codevector_soft_dist = paddle.softmax( + hidden_states.reshape((batch_size * sequence_length, self.num_groups, -1)).float(), axis=-1 + ) + perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices) + else: + # take argmax in non-differentiable way + # comptute hard codevector distribution (one hot) + codevector_idx = hidden_states.argmax(dim=-1) + codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_( + -1, codevector_idx.reshape((-1, 1)), 1.0 + ) + codevector_probs = codevector_probs.reshape((batch_size * sequence_length, self.num_groups, -1)) + + perplexity = self._compute_perplexity(codevector_probs, mask_time_indices) + + codevector_probs = codevector_probs.reshape((batch_size * sequence_length, -1)) + # use probs to retrieve codevectors + codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors + codevectors = codevectors_per_group.reshape((batch_size * sequence_length, self.num_groups, self.num_vars, -1)) + codevectors = codevectors.sum(-2).reshape((batch_size, sequence_length, -1)) + + return codevectors, perplexity + + +class Wav2Vec2Adapter(nn.Layer): + def __init__(self, config): + super().__init__() + + # feature dim might need to be down-projected + if config.output_hidden_size != config.hidden_size: + self.proj = nn.Linear(config.hidden_size, config.output_hidden_size) + self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size) + else: + self.proj = self.proj_layer_norm = None + + self.layers = nn.LayerList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers)) + self.layerdrop = config.layerdrop + + def forward(self, hidden_states): + # down project hidden_states if necessary + if self.proj is not None and self.proj_layer_norm is not None: + hidden_states = self.proj(hidden_states) + hidden_states = 
self.proj_layer_norm(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + + for layer in self.layers: + layerdrop_prob = np.random.random() + if not self.training or (layerdrop_prob > self.layerdrop): + hidden_states = layer(hidden_states) + + hidden_states = hidden_states.transpose([0, 2, 1]) + return hidden_states + + +class Wav2Vec2AdapterLayer(nn.Layer): + def __init__(self, config): + super().__init__() + self.conv = nn.Conv1D( + config.output_hidden_size, + 2 * config.output_hidden_size, + config.adapter_kernel_size, + stride=config.adapter_stride, + padding=1, + ) + + def forward(self, hidden_states): + hidden_states = self.conv(hidden_states) + hidden_states = nn.functional.glu(hidden_states, axis=1) + + return hidden_states + + +class Wav2Vec2Model(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.feature_extractor = Wav2Vec2FeatureEncoder(config) + self.feature_projection = Wav2Vec2FeatureProjection(config) + + # model only needs masking vector if mask prob is > 0.0 + if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: + # self.masked_spec_embed = nn.Parameter(paddle.Tensor(config.hidden_size).uniform_()) + #self.masked_spec_embed = paddle.uniform([config.hidden_size]) + self.masked_spec_embed = paddle.static.create_parameter(shape=[config.hidden_size], dtype='float32', default_initializer=paddle.nn.initializer.Uniform(low=0, high=1.0)) + if config.do_stable_layer_norm: + self.encoder = Wav2Vec2EncoderStableLayerNorm(config) + else: + self.encoder = Wav2Vec2Encoder(config) + + self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None + + # Initialize weights and apply final processing + self.post_init() + + def freeze_feature_extractor(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + self.feature_extractor._freeze_parameters() + + def _mask_hidden_states( + self, + hidden_states: paddle.Tensor, + mask_time_indices: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + ): + """ + Masks extracted features along time axis and/or along feature axis according to + [SpecAugment](https://arxiv.org/abs/1904.08779). 
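+
+        For example, with `mask_time_prob=0.05` and `mask_time_length=10`, at most
+        roughly 5% of the time steps are covered by length-10 spans whose hidden
+        states are replaced by the learned `masked_spec_embed` vector; spans masked
+        along the feature axis are simply set to zero.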
+ """ + + # `config.apply_spec_augment` can set masking to False + if not getattr(self.config, "apply_spec_augment", True): + return hidden_states + + # generate indices & apply SpecAugment along time axis + batch_size, sequence_length, hidden_size = hidden_states.shape + + if mask_time_indices is not None: + # apply SpecAugment along time axis with given mask_time_indices + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + elif self.config.mask_time_prob > 0 and self.training: + mask_time_indices = _compute_mask_indices( + (batch_size, sequence_length), + mask_prob=self.config.mask_time_prob, + mask_length=self.config.mask_time_length, + attention_mask=attention_mask, + min_masks=self.config.mask_time_min_masks, + ) + mask_time_indices = paddle.to_tensor(mask_time_indices, dtype=paddle.bool) + hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) + + if self.config.mask_feature_prob > 0 and self.training: + # generate indices & apply SpecAugment along feature axis + mask_feature_indices = _compute_mask_indices( + (batch_size, hidden_size), + mask_prob=self.config.mask_feature_prob, + mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, + ) + mask_feature_indices = paddle.to_tensor(mask_feature_indices, dtype=paddle.bool) + mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) + hidden_states[mask_feature_indices] = 0 + + return hidden_states + + def forward( + self, + input_values: Optional[paddle.Tensor], + attention_mask: Optional[paddle.Tensor] = None, + mask_time_indices: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Wav2Vec2BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + import numpy as np + np.save("data/paddle_input_values.npy", input_values.numpy()) + extract_features = self.feature_extractor(input_values) + extract_features = extract_features.transpose([0, 2, 1]) + + if attention_mask is not None: + # compute reduced attention_mask corresponding to feature vectors + attention_mask = self._get_feature_vector_attention_mask( + extract_features.shape[1], attention_mask, add_adapter=False + ) + np.save("data/paddle_extract_features.npy", extract_features.numpy()) + hidden_states, extract_features = self.feature_projection(extract_features) + np.save("data/paddle_feature_projection.npy", hidden_states.numpy()) + hidden_states = self._mask_hidden_states( + hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask + ) + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = encoder_outputs[0] + np.save("data/paddle_encoder_outputs.npy", hidden_states.numpy()) + + if self.adapter is not None: + hidden_states = self.adapter(hidden_states) + + if not return_dict: + return (hidden_states, extract_features) + encoder_outputs[1:] + + return Wav2Vec2BaseModelOutput( + last_hidden_state=hidden_states, + extract_features=extract_features, + 
hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + # self.init_weights() + # self._backward_compatibility_gradient_checkpointing() + pass + +class Wav2Vec2ConfigPure(): + model_type = "wav2vec2" + def __init__( + self, + vocab_size=32, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout=0.1, + activation_dropout=0.1, + attention_dropout=0.1, + feat_proj_dropout=0.1, + feat_quantizer_dropout=0.0, + final_dropout=0.1, + layerdrop=0.1, + initializer_range=0.02, + layer_norm_eps=1e-5, + feat_extract_norm="layer", + feat_extract_activation="gelu", + conv_dim=(512, 512, 512, 512, 512, 512, 512), + conv_stride=(5, 2, 2, 2, 2, 2, 2), + conv_kernel=(10, 3, 3, 3, 3, 2, 2), + conv_bias=True, + num_conv_pos_embeddings=128, + num_conv_pos_embedding_groups=16, + do_stable_layer_norm=True, + apply_spec_augment=True, + mask_time_prob=0.05, + mask_time_length=10, + mask_time_min_masks=2, + mask_feature_prob=0.0, + mask_feature_length=10, + mask_feature_min_masks=0, + num_codevectors_per_group=320, + num_codevector_groups=2, + contrastive_logits_temperature=0.1, + num_negatives=100, + codevector_dim=256, + proj_codevector_dim=256, + diversity_loss_weight=0.1, + ctc_loss_reduction="sum", + ctc_zero_infinity=False, + use_weighted_layer_sum=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + add_adapter=False, + adapter_kernel_size=3, + adapter_stride=2, + num_adapter_layers=3, + output_hidden_size=None, + **kwargs + ): + self.output_attentions = False + self.output_hidden_states = False + self.use_return_dict = True + + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.hidden_size = hidden_size + self.feat_extract_norm = feat_extract_norm + self.feat_extract_activation = feat_extract_activation + self.conv_dim = list(conv_dim) + self.conv_stride = list(conv_stride) + self.conv_kernel = list(conv_kernel) + self.conv_bias = conv_bias + self.num_conv_pos_embeddings = num_conv_pos_embeddings + self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.num_feat_extract_layers = len(self.conv_dim) + self.num_hidden_layers = num_hidden_layers + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.num_attention_heads = num_attention_heads + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.feat_proj_dropout = feat_proj_dropout + self.final_dropout = final_dropout + self.layerdrop = layerdrop + self.layer_norm_eps = layer_norm_eps + self.initializer_range = initializer_range + self.vocab_size = vocab_size + self.do_stable_layer_norm = do_stable_layer_norm + self.use_weighted_layer_sum = use_weighted_layer_sum + + if ( + (len(self.conv_stride) != self.num_feat_extract_layers) + or (len(self.conv_kernel) != self.num_feat_extract_layers) + or (len(self.conv_dim) != self.num_feat_extract_layers) + ): + raise ValueError( + "Configuration for convolutional layers is incorrect. 
It is required that `len(config.conv_dim)` ==" + " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) =" + f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`," + f" `len(config.conv_kernel) = {len(self.conv_kernel)}`." + ) + + # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779 + self.apply_spec_augment = apply_spec_augment + self.mask_time_prob = mask_time_prob + self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks + self.mask_feature_prob = mask_feature_prob + self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks + + # parameters for pretraining with codevector quantized representations + self.num_codevectors_per_group = num_codevectors_per_group + self.num_codevector_groups = num_codevector_groups + self.contrastive_logits_temperature = contrastive_logits_temperature + self.feat_quantizer_dropout = feat_quantizer_dropout + self.num_negatives = num_negatives + self.codevector_dim = codevector_dim + self.proj_codevector_dim = proj_codevector_dim + self.diversity_loss_weight = diversity_loss_weight + + # ctc loss + self.ctc_loss_reduction = ctc_loss_reduction + self.ctc_zero_infinity = ctc_zero_infinity + + # adapter + self.add_adapter = add_adapter + self.adapter_kernel_size = adapter_kernel_size + self.adapter_stride = adapter_stride + self.num_adapter_layers = num_adapter_layers + self.output_hidden_size = output_hidden_size or hidden_size + + @property + def inputs_to_logits_ratio(self): + return functools.reduce(operator.mul, self.conv_stride, 1) + + +def main(): + config = Wav2Vec2ConfigPure() + model = Wav2Vec2Model(config) + model_dict = model.state_dict() +# checkpoint_path = "wav2vec2_test" +# params_path = checkpoint_path + ".pdparams" +# paddle.save(model_dict, params_path) + revise_params_path = "exp/wav2vec2-large-960h-lv60-self.pdparams" + model_dict_revise = paddle.load(revise_params_path) + model.set_state_dict(model_dict_revise) + model.training = False + model.eval() + input_values = np.load("input_values.npy") + input_values = paddle.to_tensor(input_values) + outputs = model(input_values) + last_hidden_state = outputs.last_hidden_state + extract_features = outputs.extract_features + hidden_states = outputs.hidden_states + attentions = outputs.attentions + print (last_hidden_state) + np.save("paddle_last_hidden_state.npy", last_hidden_state.numpy()) + print ("extract_features") + print (extract_features) + np.save("paddle_extract_features.npy", extract_features.numpy()) + print ("hidden_states") + print (hidden_states) + print ("attentions") + print (attentions) + return + logits = logits.numpy() + np.save("paddle_logits.npy", logits) + +if __name__ == "__main__": + main() diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py new file mode 100644 index 000000000..057be1d46 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py @@ -0,0 +1,359 @@ +import os +import paddle +import speechbrain as sb +from 
speechbrain.processing.speech_augmentation import ( + SpeedPerturb, + DropFreq, + DropChunk, +) + + +class TimeDomainSpecAugment(paddle.nn.Layer): + """A time-domain approximation of the SpecAugment algorithm. + This augmentation module implements three augmentations in + the time-domain. + 1. Drop chunks of the audio (zero amplitude or white noise) + 2. Drop frequency bands (with band-drop filters) + 3. Speed peturbation (via resampling to slightly different rate) + Arguments + --------- + perturb_prob : float from 0 to 1 + The probability that a batch will have speed perturbation applied. + drop_freq_prob : float from 0 to 1 + The probability that a batch will have frequencies dropped. + drop_chunk_prob : float from 0 to 1 + The probability that a batch will have chunks dropped. + speeds : list of ints + A set of different speeds to use to perturb each batch. + See ``speechbrain.processing.speech_augmentation.SpeedPerturb`` + sample_rate : int + Sampling rate of the input waveforms. + drop_freq_count_low : int + Lowest number of frequencies that could be dropped. + drop_freq_count_high : int + Highest number of frequencies that could be dropped. + drop_chunk_count_low : int + Lowest number of chunks that could be dropped. + drop_chunk_count_high : int + Highest number of chunks that could be dropped. + drop_chunk_length_low : int + Lowest length of chunks that could be dropped. + drop_chunk_length_high : int + Highest length of chunks that could be dropped. + drop_chunk_noise_factor : float + The noise factor used to scale the white noise inserted, relative to + the average amplitude of the utterance. Default 0 (no noise inserted). + Example + ------- + >>> inputs = torch.randn([10, 16000]) + >>> feature_maker = TimeDomainSpecAugment(speeds=[80]) + >>> feats = feature_maker(inputs, torch.ones(10)) + >>> feats.shape + torch.Size([10, 12800]) + """ + + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, + ): + super().__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds + ) + self.drop_freq = DropFreq( + drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, + ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, + ) + + def forward(self, waveforms, lengths): + """Returns the distorted waveforms. + Arguments + --------- + waveforms : torch.Tensor + The waveforms to distort + """ + # Augmentation + with paddle.no_grad(): + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return + + +class DropFreq(torch.nn.Module): + """This class drops a random frequency from the signal. + The purpose of this class is to teach models to learn to rely on all parts + of the signal, not just a few frequency bands. + Arguments + --------- + drop_freq_low : float + The low end of frequencies that can be dropped, + as a fraction of the sampling rate / 2. 
+ drop_freq_high : float + The high end of frequencies that can be + dropped, as a fraction of the sampling rate / 2. + drop_count_low : int + The low end of number of frequencies that could be dropped. + drop_count_high : int + The high end of number of frequencies that could be dropped. + drop_width : float + The width of the frequency band to drop, as + a fraction of the sampling_rate / 2. + drop_prob : float + The probability that the batch of signals will have a frequency + dropped. By default, every batch has frequencies dropped. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropFreq() + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> dropped_signal = dropper(signal.unsqueeze(0)) + """ + + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, + ): + super().__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if torch.rand(1) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = torch.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, size=(1,), + ) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + torch.rand(drop_count) * drop_range + self.drop_freq_low + ) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = torch.zeros(1, filter_length, 1, device=waveforms.device) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter( + frequency, filter_length, self.drop_width, + ).to(waveforms.device) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + +class DropChunk(torch.nn.Module): + """This class drops portions of the input signal. + Using `DropChunk` as an augmentation strategy helps a models learn to rely + on all parts of the signal, since it can't expect a given part to be + present. + Arguments + --------- + drop_length_low : int + The low end of lengths for which to set the + signal to zero, in samples. + drop_length_high : int + The high end of lengths for which to set the + signal to zero, in samples. + drop_count_low : int + The low end of number of times that the signal + can be dropped to zero. + drop_count_high : int + The high end of number of times that the signal + can be dropped to zero. + drop_start : int + The first index for which dropping will be allowed. + drop_end : int + The last index for which dropping will be allowed. + drop_prob : float + The probability that the batch of signals will + have a portion dropped. By default, every batch + has portions dropped. 
+ noise_factor : float + The factor relative to average amplitude of an utterance + to use for scaling the white noise inserted. 1 keeps + the average amplitude the same, while 0 inserts all 0's. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.) + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> length = torch.ones(1) + >>> dropped_signal = dropper(signal, length) + >>> float(dropped_signal[:, 150]) + 0.0 + """ + + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, + ): + super().__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. 
+ Returns + ------- + Tensor of shape `[batch, time]` or + `[batch, time, channels]` + """ + + # Reading input list + lengths = (lengths * waveforms.size(1)).long() + batch_size = waveforms.size(0) + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if torch.rand(1) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = torch.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + size=(batch_size,), + ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = torch.randint( + low=self.drop_length_low, + high=self.drop_length_high + 1, + size=(drop_times[i],), + ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = torch.randint( + low=start_min, high=start_max + 1, size=(drop_times[i],), + ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j] : end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = torch.rand(length[j], device=waveforms.device) + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, start[j] : end[j]] = noise_vec + + return \ No newline at end of file diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py new file mode 100644 index 000000000..8eb56e759 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py @@ -0,0 +1,45 @@ +"""Vanilla Neural Network for simple tests. +Authors +* Elena Rastorgueva 2020 +""" +import paddle +from paddlespeech.s2t.models.wav2vec2.speechbrain.nnet import containers +import paddlespeech.s2t.models.wav2vec2.speechbrain as sb + + +class VanillaNN(containers.Sequential): + """A simple vanilla Deep Neural Network. + Arguments + --------- + activation : paddle class + A class used for constructing the activation layers. + dnn_blocks : int + The number of linear neural blocks to include. + dnn_neurons : int + The number of neurons in the linear layers. 
+ Example + ------- + >>> inputs = paddle.rand([10, 120, 60]) + >>> model = VanillaNN(input_shape=inputs.shape) + >>> outputs = model(inputs) + >>> outputs.shape + paddle.shape([10, 120, 512]) + """ + + def __init__( + self, + input_shape, + activation=paddle.nn.LeakyReLU, + dnn_blocks=2, + dnn_neurons=512, + ): + super().__init__(input_shape=input_shape) + + for block_index in range(dnn_blocks): + self.append( + sb.nnet.linear.Linear, + n_neurons=dnn_neurons, + bias=True, + layer_name="linear", + ) + self.append(activation(), layer_name="act") diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py new file mode 100644 index 000000000..f8f087714 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py @@ -0,0 +1,2 @@ +from . import linear +from . import containers diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py new file mode 100644 index 000000000..078806902 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py @@ -0,0 +1,132 @@ +import paddle +import inspect +import logging +import operator +import functools + +class Sequential(paddle.nn.LayerDict): + """A sequence of modules with potentially inferring shape on construction. + If layers are passed with names, these can be referenced with dot notation. + Arguments + --------- + input_shape : iterable + A list or tuple of ints or None, representing the expected shape of an + input tensor. None represents a variable-length dimension. If no + ``input_shape`` is passed, no shape inference will be performed. + *layers, **named_layers + The inputs are treated as a list of layers to be + applied in sequence. The output shape of each layer is used to + infer the shape of the following layer. If a tuple is returned, + only the shape of the first element is used to determine input + shape of the next layer (e.g. RNN returns output, hidden). 
+ Example + ------- + >>> inputs = paddle.rand(10, 40, 50) + >>> model = Sequential(input_shape=inputs.shape) + >>> model.append(Linear, n_neurons=100, layer_name="layer1") + >>> model.append(Linear, n_neurons=200, layer_name="layer2") + >>> outputs = model(inputs) + >>> outputs.shape + paddle.shape([10, 40, 200]) + >>> outputs = model.layer1(inputs) + >>> outputs.shape + paddle.shape([10, 40, 100]) + """ + + def __init__(self, *layers, input_shape=None, **named_layers): + super().__init__() + + # Make sure either layers or input_shape is passed + if not layers and input_shape is None and not named_layers: + raise ValueError("Must pass either layers or input shape") + + # Keep track of what layers need "lengths" passed + self.length_layers = [] + + # Replace None dimensions with arbitrary value + self.input_shape = input_shape + if input_shape and None in input_shape: + self.input_shape = list(input_shape) + for i, dim in enumerate(self.input_shape): + + # To reduce size of dummy tensors, use 1 for batch dim + if i == 0 and dim is None: + dim = 1 + + # Use 64 as nice round arbitrary value, big enough that + # halving this dimension a few times doesn't reach 1 + self.input_shape[i] = dim or 256 + + # Append non-named layers + for layer in layers: + self.append(layer) + + # Append named layers + for name, layer in named_layers.items(): + self.append(layer, layer_name=name) + + def append(self, layer, *args, layer_name=None, **kwargs): + """Add a layer to the list of layers, inferring shape if necessary. + Arguments + --------- + layer : A paddle.nn.Module class or object + If the layer is a class, it should accept an argument called + ``input_shape`` which will be inferred and passed. If the layer + is a module object, it is added as-is. + layer_name : str + The name of the layer, for reference. If the name is in use, + ``_{count}`` will be appended. + *args, **kwargs + These are passed to the layer if it is constructed. + """ + + # Compute layer_name + if layer_name is None: + layer_name = str(len(self)) + elif layer_name in self: + index = 0 + while f"{layer_name}_{index}" in self: + index += 1 + layer_name = f"{layer_name}_{index}" + + # Check if it needs to be constructed with input shape + if self.input_shape: + argspec = inspect.getfullargspec(layer) + if "input_shape" in argspec.args + argspec.kwonlyargs: + input_shape = self.get_output_shape() + layer = layer(*args, input_shape=input_shape, **kwargs) + + # Finally, append the layer. + try: + self[layer_name] = layer + # self.add_module(layer_name, layer) + except TypeError: + raise ValueError( + "Must pass `input_shape` at initialization and use " + "modules that take `input_shape` to infer shape when " + "using `append()`." + ) + + def get_output_shape(self): + """Returns expected shape of the output. + Computed by passing dummy input constructed with the + ``self.input_shape`` attribute. + """ + with paddle.no_grad(): + dummy_input = paddle.zeros(self.input_shape) + dummy_output = self(dummy_input) + return dummy_output.shape + + def forward(self, x): + """Applies layers in sequence, passing only the first element of tuples. + Arguments + --------- + x : paddle.Tensor + The input tensor to run through the network. 
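+        Returns
+        -------
+        paddle.Tensor
+            The output of the final layer. If a layer returns a tuple
+            (e.g. an RNN), only its first element is propagated.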
+ """ + for layer in self.values(): + x = layer(x) + if isinstance(x, tuple): + x = x[0] + + return x diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py new file mode 100644 index 000000000..26389d908 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py @@ -0,0 +1,73 @@ +"""Library implementing linear transformation. +Authors + * Mirco Ravanelli 2020 + * Davide Borra 2021 +""" + +import logging +import paddle +import paddle.nn as nn +from paddlespeech.s2t.modules import align + +logger = logging.getLogger(__name__) + + +class Linear(paddle.nn.Layer): + """Computes a linear transformation y = wx + b. + Arguments + --------- + n_neurons : int + It is the number of output neurons (i.e, the dimensionality of the + output). + input_shape: tuple + It is the shape of the input tensor. + input_size: int + Size of the input tensor. + bias : bool + If True, the additive bias b is adopted. + combine_dims : bool + If True and the input is 4D, combine 3rd and 4th dimensions of input. + Example + ------- + >>> inputs = paddle.rand(10, 50, 40) + >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100) + >>> output = lin_t(inputs) + >>> output.shape + paddle.shape([10, 50, 100]) + """ + + def __init__( + self, + n_neurons, + input_shape=None, + input_size=None, + bias=True, + combine_dims=False, + ): + super().__init__() + self.combine_dims = combine_dims + + if input_shape is None and input_size is None: + raise ValueError("Expected one of input_shape or input_size") + + if input_size is None: + input_size = input_shape[-1] + if len(input_shape) == 4 and self.combine_dims: + input_size = input_shape[2] * input_shape[3] + + # Weights are initialized following paddle approach + self.w = align.Linear(input_size, n_neurons, bias_attr=bias) + + def forward(self, x): + """Returns the linear transformation of input tensor. + Arguments + --------- + x : paddle.Tensor + Input to transform linearly. + """ + if x.rank == 4 and self.combine_dims: + x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]) + + wx = self.w(x) + + return wx diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py new file mode 100644 index 000000000..aeae11c0b --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py @@ -0,0 +1,256 @@ +""" +Low level signal processing utilities +Authors + * Peter Plantinga 2020 + * Francois Grondin 2020 + * William Aris 2020 + * Samuele Cornell 2020 + * Sarthak Yadav 2022 +""" +import paddle +import math +from packaging import version +import numpy as np + +def blackman_window(window_length, periodic=True): + if window_length == 0: + return [] + if window_length == 1: + return paddle.ones([1]) + if periodic: + window_length += 1 + + + + + window = paddle.arange(window_length) * (np.pi / (window_length - 1)) + window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42 + return window[:-1] if periodic else window + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + """Compute amplitude of a batch of waveforms. + Arguments + --------- + waveform : tensor + The waveforms used for computing amplitude. + Shape should be `[time]` or `[batch, time]` or + `[batch, time, channels]`. + lengths : tensor + The lengths of the waveforms excluding the padding. 
+ Shape should be a single dimension, `[batch]`. + amp_type : str + Whether to compute "avg" average or "peak" amplitude. + Choose between ["avg", "peak"]. + scale : str + Whether to compute amplitude in "dB" or "linear" scale. + Choose between ["linear", "dB"]. + Returns + ------- + The average amplitude of the waveforms. + Example + ------- + >>> signal = torch.sin(torch.arange(16000.0)).unsqueeze(0) + >>> compute_amplitude(signal, signal.size(1)) + tensor([[0.6366]]) + """ + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0] + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) # clamp zeros + else: + raise NotImplementedError + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, + use_fft=False, + rotation_index=0, +): + """Use torch.nn.functional to perform 1d padding and conv. + Arguments + --------- + waveform : tensor + The tensor to perform operations on. + kernel : tensor + The filter to apply during convolution. + padding : int or tuple + The padding (pad_left, pad_right) to apply. + If an integer is passed instead, this is passed + to the conv1d function and pad_type is ignored. + pad_type : str + The type of padding to use. Passed directly to + `torch.nn.functional.pad`, see PyTorch documentation + for available options. + stride : int + The number of units to move each time convolution is applied. + Passed to conv1d. Has no effect if `use_fft` is True. + groups : int + This option is passed to `conv1d` to split the input into groups for + convolution. Input channels should be divisible by the number of groups. + use_fft : bool + When `use_fft` is passed `True`, then compute the convolution in the + spectral domain using complex multiply. This is more efficient on CPU + when the size of the kernel is large (e.g. reverberation). WARNING: + Without padding, circular convolution occurs. This makes little + difference in the case of reverberation, but may make more difference + with different kernels. + rotation_index : int + This option only applies if `use_fft` is true. If so, the kernel is + rolled by this amount before convolution to shift the output location. + Returns + ------- + The convolved waveform. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0).unsqueeze(2) + >>> kernel = torch.rand(1, 10, 1) + >>> signal = convolve1d(signal, kernel, padding=(9, 0)) + """ + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Move time dimension last, which pad and fft and conv expect. 
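+    # (Paddle's pad and conv1d used below expect channel-first NCL data, so
+    # [batch, time, channels] becomes [batch, channels, time] here; the result
+    # is transposed back before being returned.)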
+ waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, tuple): + waveform = paddle.nn.functional.pad( + x=waveform, pad=padding, mode=pad_type, + ) + + # This approach uses FFT, which is more efficient if the kernel is large + if use_fft: + + # Pad kernel to same length as signal, ensuring correct alignment + zero_length = waveform.shape[-1] - kernel.shape[-1] + + # Handle case where signal is shorter + if zero_length < 0: + kernel = kernel[..., :zero_length] + zero_length = 0 + + # Perform rotation to ensure alignment + zeros = paddle.zeros( + kernel.shape[0], kernel.shape[1], zero_length + ) + after_index = kernel[..., rotation_index:] + before_index = kernel[..., :rotation_index] + kernel = paddle.concat((after_index, zeros, before_index), axis=-1) + + # Multiply in frequency domain to convolve in time domain + # if version.parse(torch.__version__) > version.parse("1.6.0"): + import paddle.fft as fft + + result = fft.rfft(waveform) * fft.rfft(kernel) + convolved = fft.irfft(result, n=waveform.shape[-1]) + # else: + # f_signal = torch.rfft(waveform, 1) + # f_kernel = torch.rfft(kernel, 1) + # sig_real, sig_imag = f_signal.unbind(-1) + # ker_real, ker_imag = f_kernel.unbind(-1) + # f_result = torch.stack( + # [ + # sig_real * ker_real - sig_imag * ker_imag, + # sig_real * ker_imag + sig_imag * ker_real, + # ], + # dim=-1, + # ) + # convolved = torch.irfft( + # f_result, 1, signal_sizes=[waveform.size(-1)] + # ) + + # Use the implementation given by torch, which should be efficient on GPU + else: + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, tuple) else 0, + ) + + # Return time dimension to the second dimension. + return convolved.transpose([0, 2, 1]) + +def notch_filter(notch_freq, filter_width=101, notch_width=0.05): + """Returns a notch filter constructed from a high-pass and low-pass filter. + (from https://tomroelandts.com/articles/ + how-to-create-simple-band-pass-and-band-reject-filters) + Arguments + --------- + notch_freq : float + frequency to put notch as a fraction of the + sampling rate / 2. The range of possible inputs is 0 to 1. + filter_width : int + Filter width in samples. Longer filters have + smaller transition bands, but are more inefficient. + notch_width : float + Width of the notch, as a fraction of the sampling_rate / 2. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0).unsqueeze(2) + >>> kernel = notch_filter(0.25) + >>> notched_signal = convolve1d(signal, kernel) + """ + + # Check inputs + assert 0 < notch_freq <= 1 + assert filter_width % 2 != 0 + pad = filter_width // 2 + inputs = paddle.arange(filter_width) - pad + + # Avoid frequencies that are too low + notch_freq += notch_width + + # Define sinc function, avoiding division by zero + def sinc(x): + "Computes the sinc function." + + def _sinc(x): + return paddle.sin(x) / x + + # The zero is at the middle index + return paddle.concat([_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1 :])]) + + # Compute a low-pass filter with cutoff frequency notch_freq. + hlpf = sinc(3 * (notch_freq - notch_width) * inputs) + hlpf *= blackman_window(filter_width) + hlpf /= paddle.sum(hlpf) + + # Compute a high-pass filter with cutoff frequency notch_freq. 
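+    # The high-pass is the spectral inversion of a windowed-sinc low-pass:
+    # negate the taps (dividing by the negative sum also normalizes them) and
+    # add a unit impulse at the centre tap. Summing it with the low-pass above
+    # gives the band-reject (notch) kernel that is returned below.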
+ hhpf = sinc(3 * (notch_freq + notch_width) * inputs) + hhpf *= blackman_window(filter_width) + hhpf /= -paddle.sum(hhpf) + hhpf[pad] += 1 + + # Adding filters creates notch filter + return (hlpf + hhpf).view(1, -1, 1) + diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py new file mode 100644 index 000000000..1cbbe11af --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py @@ -0,0 +1,741 @@ +import math +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.signal_processing import ( + compute_amplitude, + convolve1d, + notch_filter) +import pdb +class SpeedPerturb(nn.Layer): + """Slightly speed up or slow down an audio signal. + Resample the audio signal at a rate that is similar to the original rate, + to achieve a slightly slower or slightly faster signal. This technique is + outlined in the paper: "Audio Augmentation for Speech Recognition" + Arguments + --------- + orig_freq : int + The frequency of the original signal. + speeds : list + The speeds that the signal should be changed to, as a percentage of the + original signal (i.e. `speeds` is divided by 100 to get a ratio). + perturb_prob : float + The chance that the batch will be speed- + perturbed. By default, every batch is perturbed. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90]) + >>> clean = signal.unsqueeze(0) + >>> perturbed = perturbator(clean) + >>> clean.shape + torch.Size([1, 52173]) + >>> perturbed.shape + torch.Size([1, 46956]) + """ + + def __init__( + self, orig_freq, speeds=[90, 100, 110], perturb_prob=1.0, + ): + super().__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + + return waveform.clone() + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=(1,))[0] + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + +class Resample(nn.Layer): + """This class resamples an audio signal using sinc-based interpolation. + + It is a modification of the `resample` function from torchaudio + (https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html) + + Arguments + --------- + orig_freq : int + the sampling frequency of the input signal. + new_freq : int + the new sampling frequency after this operation is performed. + lowpass_filter_width : int + Controls the sharpness of the filter, larger numbers result in a + sharper filter, but they are less efficient. Values from 4 to 10 are allowed. 
+ + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> resampler = Resample(orig_freq=16000, new_freq=8000) + >>> resampled = resampler(signal) + >>> signal.shape + torch.Size([1, 52173]) + >>> resampled.shape + torch.Size([1, 26087]) + """ + + def __init__( + self, orig_freq=16000, new_freq=16000, lowpass_filter_width=6, + ): + super().__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + """Compute the phases in polyphase filter. + + (almost directly from torchaudio.compliance.kaldi) + """ + + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + unsqueezed = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + """Resamples the waveform at the new frequency. + + This matches Kaldi's OfflineFeatureTpl ResampleWaveform which uses a + LinearResample (resample a signal at linearly spaced intervals to + up/downsample a signal). LinearResample (LR) means that the output + signal is at linearly spaced intervals (i.e the output signal has a + frequency of `new_freq`). It uses sinc/bandlimited interpolation to + upsample/downsample the signal. + + (almost directly from torchaudio.compliance.kaldi) + + https://ccrma.stanford.edu/~jos/resample/ + Theory_Ideal_Bandlimited_Interpolation.html + + https://github.com/kaldi-asr/kaldi/blob/master/src/feat/resample.h#L56 + + Arguments + --------- + waveforms : tensor + The batch of audio signals to resample. + + Returns + ------- + The waveforms at the new frequency. 
+ """ + + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros( + (batch_size, num_channels, tot_output_samp) + ) + # self.weights = self.weights.to(waveforms.device) + + # Check weights are on correct device + # if waveforms.device != self.weights.device: + # self.weights = self.weights.to(waveforms.device) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[..., first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, (left_padding, right_padding), data_format='NCL' + ) + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + weight=self.weights[i].repeat(num_channels, 1, 1), + stride=self.conv_stride, + groups=num_channels, + ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride + ) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, (left_padding, right_padding), data_format='NCL' + ) + dilated_conv_wave = dilated_conv_wave[..., :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + """Based on LinearResample::GetNumOutputSamples. + + LinearResample (LR) means that the output signal is at + linearly spaced intervals (i.e the output signal has a + frequency of ``new_freq``). It uses sinc/bandlimited + interpolation to upsample/downsample the signal. + + (almost directly from torchaudio.compliance.kaldi) + + Arguments + --------- + input_num_samp : int + The number of samples in each example in the batch. + + Returns + ------- + Number of samples in the output waveform. + """ + + # For exact computation, we measure time in "ticks" of 1.0 / tick_freq, + # where tick_freq is the least common multiple of samp_in and + # samp_out. + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. 
+ # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. + if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. + num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + """Based on LinearResample::SetIndexesAndWeights + + Retrieves the weights for resampling as well as the indices in which + they are valid. LinearResample (LR) means that the output signal is at + linearly spaced intervals (i.e the output signal has a frequency + of ``new_freq``). It uses sinc/bandlimited interpolation to + upsample/downsample the signal. + + Returns + ------- + - the place where each filter should start being applied + - the filters to be applied to the signal for resampling + """ + + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange( + start=0.0, end=self.output_samples + ) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width) + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + + inside_window_indices = delta_t.abs() < (window_width) + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * ( + 1 + + paddle.cos( + 2 + * math.pi + * lowpass_cutoff + / self.lowpass_filter_width + * delta_t[inside_window_indices] + ) + ) + t_eq_zero_indices = delta_t == 0.0 + t_not_eq_zero_indices = ~t_eq_zero_indices + + # sinc filter function + weights[t_not_eq_zero_indices] *= paddle.sin( + 2 * math.pi * lowpass_cutoff * delta_t[t_not_eq_zero_indices] + ) / (math.pi * delta_t[t_not_eq_zero_indices]) + + # limit of the function at t = 0 + weights[t_eq_zero_indices] *= 2 * lowpass_cutoff + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class DropFreq(nn.Layer): + """This class drops a random frequency from the signal. + The purpose of this class is to teach models to learn to rely on all parts + of the signal, not just a few frequency bands. + Arguments + --------- + drop_freq_low : float + The low end of frequencies that can be dropped, + as a fraction of the sampling rate / 2. + drop_freq_high : float + The high end of frequencies that can be + dropped, as a fraction of the sampling rate / 2. + drop_count_low : int + The low end of number of frequencies that could be dropped. + drop_count_high : int + The high end of number of frequencies that could be dropped. 
+ drop_width : float + The width of the frequency band to drop, as + a fraction of the sampling_rate / 2. + drop_prob : float + The probability that the batch of signals will have a frequency + dropped. By default, every batch has frequencies dropped. + Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropFreq() + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> dropped_signal = dropper(signal.unsqueeze(0)) + """ + + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, + ): + super().__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. + """ + + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=(1,), + ) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand(drop_count) * drop_range + self.drop_freq_low + ) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter( + frequency, filter_length, self.drop_width, + ) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + +class DropChunk(nn.Layer): + """This class drops portions of the input signal. + Using `DropChunk` as an augmentation strategy helps a models learn to rely + on all parts of the signal, since it can't expect a given part to be + present. + Arguments + --------- + drop_length_low : int + The low end of lengths for which to set the + signal to zero, in samples. + drop_length_high : int + The high end of lengths for which to set the + signal to zero, in samples. + drop_count_low : int + The low end of number of times that the signal + can be dropped to zero. + drop_count_high : int + The high end of number of times that the signal + can be dropped to zero. + drop_start : int + The first index for which dropping will be allowed. + drop_end : int + The last index for which dropping will be allowed. + drop_prob : float + The probability that the batch of signals will + have a portion dropped. By default, every batch + has portions dropped. + noise_factor : float + The factor relative to average amplitude of an utterance + to use for scaling the white noise inserted. 1 keeps + the average amplitude the same, while 0 inserts all 0's. 
+ Example + ------- + >>> from speechbrain.dataio.dataio import read_audio + >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.) + >>> signal = read_audio('tests/samples/single-mic/example1.wav') + >>> signal = signal.unsqueeze(0) # [batch, time, channels] + >>> length = torch.ones(1) + >>> dropped_signal = dropper(signal, length) + >>> float(dropped_signal[:, 150]) + 0.0 + """ + + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, + ): + super().__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + Returns + ------- + Tensor of shape `[batch, time]` or + `[batch, time, channels]` + """ + + # Reading input list + lengths = (lengths * waveforms.shape[1]).long() + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=(batch_size,), + ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 1, + shape=(drop_times[i],), + ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, high=start_max + 1, shape=(drop_times[i],), + ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + dropped_waveform[i, start[j] : end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand(length[j]) + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, start[j] : end[j]] = 
noise_vec + + return dropped_waveform + + +class TimeDomainSpecAugment(nn.Layer): + """A time-domain approximation of the SpecAugment algorithm. + + This augmentation module implements three augmentations in + the time-domain. + + 1. Drop chunks of the audio (zero amplitude or white noise) + 2. Drop frequency bands (with band-drop filters) + 3. Speed peturbation (via resampling to slightly different rate) + + Arguments + --------- + perturb_prob : float from 0 to 1 + The probability that a batch will have speed perturbation applied. + drop_freq_prob : float from 0 to 1 + The probability that a batch will have frequencies dropped. + drop_chunk_prob : float from 0 to 1 + The probability that a batch will have chunks dropped. + speeds : list of ints + A set of different speeds to use to perturb each batch. + See ``speechbrain.processing.speech_augmentation.SpeedPerturb`` + sample_rate : int + Sampling rate of the input waveforms. + drop_freq_count_low : int + Lowest number of frequencies that could be dropped. + drop_freq_count_high : int + Highest number of frequencies that could be dropped. + drop_chunk_count_low : int + Lowest number of chunks that could be dropped. + drop_chunk_count_high : int + Highest number of chunks that could be dropped. + drop_chunk_length_low : int + Lowest length of chunks that could be dropped. + drop_chunk_length_high : int + Highest length of chunks that could be dropped. + drop_chunk_noise_factor : float + The noise factor used to scale the white noise inserted, relative to + the average amplitude of the utterance. Default 0 (no noise inserted). + + Example + ------- + >>> inputs = torch.randn([10, 16000]) + >>> feature_maker = TimeDomainSpecAugment(speeds=[80]) + >>> feats = feature_maker(inputs, torch.ones(10)) + >>> feats.shape + torch.Size([10, 12800]) + """ + + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, + ): + super().__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds + ) + self.drop_freq = DropFreq( + drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, + ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, + ) + + def forward(self, waveforms, lengths): + """Returns the distorted waveforms. 
+ + Arguments + --------- + waveforms : torch.Tensor + The waveforms to distort + """ + # Augmentation + with paddle.no_grad(): + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + return waveforms \ No newline at end of file diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py new file mode 100644 index 000000000..da243342c --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py @@ -0,0 +1,14 @@ +import paddle +import numpy as np + +def blackman_window(window_length, periodic=True): + if window_length == 0: + return [] + if window_length == 1: + return paddle.ones([1]) + if periodic: + window_length += 1 + + window = paddle.arange(window_length) * (np.pi / (window_length - 1)) + window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42 + return window[:-1] if periodic else window diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py new file mode 100644 index 000000000..e20a7e129 --- /dev/null +++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py @@ -0,0 +1,287 @@ +import numpy as np +import os + +from typing import Dict +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ConfigPure +from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model +from paddlespeech.s2t.modules.mask import make_pad_mask +from paddlespeech.s2t.utils.utility import log_add + +from collections import defaultdict + +from paddlespeech.s2t.models.wav2vec2.speechbrain.lobes.models.VanillaNN import VanillaNN +from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC +from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank +from yacs.config import CfgNode + +class Wav2vec2ASR(nn.Layer): + def __init__(self, config: dict): + super().__init__() + + wav2vec2_config = Wav2Vec2ConfigPure() + wav2vec2 = Wav2Vec2Model(wav2vec2_config) + + model_dict = paddle.load(config.wav2vec2_params_path) + wav2vec2.set_state_dict(model_dict) + wav2vec2.eval() + self.normalize_wav = config.normalize_wav + self.output_norm = config.output_norm + if config.freeze_wav2vec2: + for parm in wav2vec2.parameters(): + parm.trainable = False + self.wav2vec2 = wav2vec2 + self.enc = VanillaNN(input_shape=[None,None,wav2vec2_config.hidden_size], activation=nn.LeakyReLU, dnn_blocks=config.dnn_blocks, dnn_neurons=config.dnn_neurons) + self.ctc = CTC(odim=config.output_dim, enc_n_units=config.dnn_neurons, blank_id=config.blank_id, dropout_rate=config.ctc_dropout_rate, reduction_type="mean") + + def train_batch(self): + wav, wavs_lens_rate, target, target_lens_rate = self._get_data() + ctc_loss = self(wav, wavs_lens_rate, target, target_lens_rate) + + + def forward(self, wav, wavs_lens_rate, target, target_lens_rate): + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + np.save("data/out.npy", out.numpy()) + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + + x = self.enc(feats) + x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64) + target_lens = (target_lens_rate * target.shape[1]).round().astype(paddle.int64) + + 
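+        # CTC loss over the frame-level encoder output: x is (B, T', dnn_neurons),
+        # and the absolute frame/label lengths were recovered above from the
+        # relative (padding-ratio) lengths.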
ctc_loss = self.ctc(x, x_lens, target, target_lens) + return ctc_loss + + + @paddle.no_grad() + def decode(self, + feats: paddle.Tensor, + feats_lengths: paddle.Tensor, + text_feature: Dict[str, int], + decoding_method: str, + beam_size: int): + batch_size = feats.shape[0] + if decoding_method is 'ctc_prefix_beam_search' and batch_size > 1: + logger.error( + f'decoding mode {decoding_method} must be running with batch_size == 1' + ) + logger.error(f"current batch_size is {batch_size}") + sys.exit(1) + + if decoding_method == 'ctc_greedy_search': + hyps = self.ctc_greedy_search(feats, feats_lengths) + res = [text_feature.defeaturize(hyp) for hyp in hyps] + res_tokenids = [hyp for hyp in hyps] + # ctc_prefix_beam_search and attention_rescoring only return one + # result in List[int], change it to List[List[int]] for compatible + # with other batch decoding mode + elif decoding_method == 'ctc_prefix_beam_search': + assert feats.shape[0] == 1 + hyp = self.ctc_prefix_beam_search( + feats, + feats_lengths, + beam_size) + res = [text_feature.defeaturize(hyp)] + res_tokenids = [hyp] + else: + raise ValueError(f"wav2vec2 not support decoding method: {decoding_method}") + + return res, res_tokenids + + @classmethod + def from_config(cls, config): + model = cls(config) + return model + + def ctc_greedy_search( + self, wav, wavs_lens_rate) -> List[List[int]]: + """ Apply CTC greedy search + Args: + speech (paddle.Tensor): (batch, max_len) + speech_length (paddle.Tensor): (batch, ) + Returns: + List[List[int]]: best path result + """ + batch_size = wav.shape[0] + wav = wav[:,:,0] + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + x = self.enc(feats) + x_lens = x.shape[1] + ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size) + topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1) + topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen) + # pad_mask = make_pad_mask(x_lens) # (B, maxlen) + # topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen) + + hyps = [hyp.tolist() for hyp in topk_index] + hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps] + return hyps + + def _ctc_prefix_beam_search( + self, wav, wavs_lens_rate, beam_size, blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]: + """ CTC prefix beam search inner implementation + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood) + paddle.Tensor: encoder output, (1, max_len, encoder_dim), + it will be used for rescoring in attention rescoring mode + """ + wav = wav[:,:,0] + + if self.normalize_wav: + wav = F.layer_norm(wav, wav.shape[1:]) + # Extract wav2vec output + out = self.wav2vec2(wav)[0] + # We normalize the output if required + if self.output_norm: + out = F.layer_norm(out, out.shape[1:]) + feats = out + + x = self.enc(feats) + maxlen = x.shape[1] + ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size) + ctc_probs = ctc_probs.squeeze(0) + + # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score)) + # blank_ending_score and none_blank_ending_score in ln domain + cur_hyps = [(tuple(), (0.0, -float('inf')))] + # 2. CTC beam search step by step + for t in range(0, maxlen): + logp = ctc_probs[t] # (vocab_size,) + # key: prefix, value (pb, pnb), default value(-inf, -inf) + next_hyps = defaultdict(lambda: (-float('inf'), -float('inf'))) + # 2.1 First beam prune: select topk best + top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,) + for s in top_k_index: + s = s.item() + ps = logp[s].item() + for prefix, (pb, pnb) in cur_hyps: + last = prefix[-1] if len(prefix) > 0 else None + if s == blank_id: # blank + n_pb, n_pnb = next_hyps[prefix] + n_pb = log_add([n_pb, pb + ps, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + elif s == last: + # Update *ss -> *s; + n_pb, n_pnb = next_hyps[prefix] + n_pnb = log_add([n_pnb, pnb + ps]) + next_hyps[prefix] = (n_pb, n_pnb) + # Update *s-s -> *ss, - is for blank + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + else: + n_prefix = prefix + (s, ) + n_pb, n_pnb = next_hyps[n_prefix] + n_pnb = log_add([n_pnb, pb + ps, pnb + ps]) + next_hyps[n_prefix] = (n_pb, n_pnb) + + # 2.2 Second beam prune + next_hyps = sorted( + next_hyps.items(), + key=lambda x: log_add(list(x[1])), + reverse=True) + cur_hyps = next_hyps[:beam_size] + + hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps] + return hyps + + def ctc_prefix_beam_search(self, wav, wavs_lens_rate, beam_size) -> List[int]: + """ Apply CTC prefix beam search + Args: + speech (paddle.Tensor): (batch, max_len, feat_dim) + speech_length (paddle.Tensor): (batch, ) + beam_size (int): beam size for beam search + decoding_chunk_size (int): decoding chunk for dynamic chunk + trained model. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. 
+ 0: used for training, it's prohibited here + simulate_streaming (bool): whether do encoder forward in a + streaming fashion + Returns: + List[int]: CTC prefix beam search nbest results + """ + hyps = self._ctc_prefix_beam_search( + wav, wavs_lens_rate, beam_size) + return hyps[0][0] + + # @jit.to_static + def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: + """ Export interface for c++ call, apply linear transform and log + softmax before ctc + Args: + xs (paddle.Tensor): encoder output, (B, T, D) + Returns: + paddle.Tensor: activation before ctc + """ + return self.ctc.log_softmax(xs) + + + def _get_data(self): + data_dir = "data" + wavs = np.load(os.path.join(data_dir, "wavs.npy")) + wavs_lens = np.load(os.path.join(data_dir, "wavs_lens.npy")) + tokens = np.load(os.path.join(data_dir, "tokens.npy")) + tokens_lens = np.load(os.path.join(data_dir, "tokens_lens.npy")) + + batch = (paddle.to_tensor(wavs), paddle.to_tensor(wavs_lens, dtype='float32'), + paddle.to_tensor(tokens, dtype='int32'), paddle.to_tensor(tokens_lens, dtype='float32')) + return batch + + +if __name__ == "__main__": + # wav2vec2_asr = Wav2vec2ASR(config={}) + # wav2vec2_asr.train_batch() + freeze = True + config = Wav2Vec2ConfigPure() + model = Wav2Vec2Model(config) + model_dict = model.state_dict() + revise_params_path = "exp/torch_to_paddle_revise.pdparams" + model_dict_revise = paddle.load(revise_params_path) + model.set_state_dict(model_dict_revise) + model.training = True + model.eval() + if freeze: + for parm in model.parameters(): + parm.requires_grad = False + # get enc() + enc = VanillaNN(input_shape=[None,None,1024], activation=paddle.nn.LeakyReLU, dnn_blocks=2, dnn_neurons=1024) + + ctc = CTC(odim=30, enc_n_units=1024, blank_id=0, dropout_rate=0.0) + + input_values = np.load("input_values.npy") + input_values = paddle.to_tensor(input_values) + + feats = model(input_values).last_hidden_state + x = enc(feats) + ctc_loss = ctc(enc, target) diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py index 34d796145..cacda2461 100644 --- a/paddlespeech/s2t/modules/align.py +++ b/paddlespeech/s2t/modules/align.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import math - import paddle from paddle import nn +import math """ To align the initializer between paddle and torch, the API below are set defalut initializer with priority higger than global initializer. 
@@ -82,18 +81,10 @@ class Linear(nn.Linear): name=None): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Linear, self).__init__(in_features, out_features, weight_attr, bias_attr, name) @@ -113,18 +104,10 @@ class Conv1D(nn.Conv1D): data_format='NCL'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Conv1D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) @@ -145,18 +128,10 @@ class Conv2D(nn.Conv2D): data_format='NCHW'): if weight_attr is None: if global_init_type == "kaiming_uniform": - weight_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) if bias_attr is None: if global_init_type == "kaiming_uniform": - bias_attr = paddle.ParamAttr( - initializer=nn.initializer.KaimingUniform( - fan_in=None, - negative_slope=math.sqrt(5), - nonlinearity='leaky_relu')) + bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu')) super(Conv2D, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 92990048d..b6d615867 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -15,6 +15,7 @@ # Modified from wenet(https://github.com/wenet-e2e/wenet) """Multi-Head Attention layer definition.""" import math +from typing import Optional from typing import Tuple import paddle @@ -82,12 +83,11 @@ class MultiHeadedAttention(nn.Layer): return q, k, v - def forward_attention( - self, - value: paddle.Tensor, + def forward_attention(self, + value: paddle.Tensor, scores: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) - ) -> paddle.Tensor: + mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool), + ) -> 
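Editor's note: the align.py hunks above only reflow the ParamAttr construction; the initializer call itself is unchanged. A hedged sketch of the same wiring on a standalone Linear layer (the 256/512 sizes are illustrative), matching torch's default kaiming_uniform_ with negative_slope=sqrt(5):

# Sketch: give a paddle Linear the KaimingUniform init used by the aligned layers.
import math
import paddle
from paddle import nn

def kaiming_attr():
    return paddle.ParamAttr(
        initializer=nn.initializer.KaimingUniform(
            fan_in=None,                       # infer fan_in from the parameter shape
            negative_slope=math.sqrt(5),
            nonlinearity='leaky_relu'))

linear = nn.Linear(256, 512, weight_attr=kaiming_attr(), bias_attr=kaiming_attr())
print(linear.weight.shape, linear.bias.shape)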
paddle.Tensor: """Compute attention context vector. Args: value (paddle.Tensor): Transformed value, size @@ -108,7 +108,7 @@ class MultiHeadedAttention(nn.Layer): # When will `if mask.size(2) > 0` be False? # 1. onnx(16/-1, -1/-1, 16/0) # 2. jit (16/-1, -1/-1, 16/0, 16/4) - if paddle.shape(mask)[2] > 0: # time2 > 0 + if paddle.shape(mask)[2] > 0: # time2 > 0 mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2) # for last chunk, time2 might be larger than scores.size(-1) mask = mask[:, :, :, :paddle.shape(scores)[-1]] @@ -127,15 +127,14 @@ class MultiHeadedAttention(nn.Layer): return self.linear_out(x) # (batch, time1, d_model) - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + pos_emb: paddle.Tensor = paddle.empty([0]), + cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute scaled dot product attention. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). @@ -244,15 +243,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): return x - def forward( - self, - query: paddle.Tensor, - key: paddle.Tensor, - value: paddle.Tensor, - mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - pos_emb: paddle.Tensor, # paddle.empty([0]) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + query: paddle.Tensor, + key: paddle.Tensor, + value: paddle.Tensor, + mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), + pos_emb: paddle.Tensor = paddle.empty([0]), + cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute 'Scaled Dot Product Attention' with rel. positional encoding. Args: query (paddle.Tensor): Query tensor (#batch, time1, size). diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index b35fea5b9..c384b9c78 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -14,6 +14,7 @@ # limitations under the License. # Modified from wenet(https://github.com/wenet-e2e/wenet) """ConvolutionModule definition.""" +from typing import Optional from typing import Tuple import paddle @@ -105,12 +106,11 @@ class ConvolutionModule(nn.Layer): ) self.activation = activation - def forward( - self, - x: paddle.Tensor, - mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) - cache: paddle.Tensor # paddle.zeros([0,0,0,0]) - ) -> Tuple[paddle.Tensor, paddle.Tensor]: + def forward(self, + x: paddle.Tensor, + mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool), + cache: paddle.Tensor= paddle.zeros([0,0,0]), + ) -> Tuple[paddle.Tensor, paddle.Tensor]: """Compute convolution module. Args: x (paddle.Tensor): Input tensor (#batch, time, channels). 
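Editor's note: the attention hunks above turn `mask`, `pos_emb` and `cache` into default tensor arguments, with an empty `(0, 0, 0)` mask meaning "no masking" (hence the `paddle.shape(mask)[2] > 0` check). A hedged NumPy sketch of scaled dot-product attention following the same convention; shapes and names are illustrative only:

# "Empty mask means no mask" convention from forward_attention, in NumPy.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def forward_attention(value, scores, mask=np.ones((0, 0, 0), dtype=bool)):
    # value:  (batch, head, time2, d_k)
    # scores: (batch, head, time1, time2)
    # mask:   (batch, 1, time2) of "valid" booleans, or empty -> skip masking
    if mask.shape[-1] > 0:                        # mirrors paddle.shape(mask)[2] > 0
        invalid = ~mask[:, None, :, :]            # (batch, 1, 1, time2), broadcast over heads
        scores = np.where(invalid, -np.inf, scores)
    attn = softmax(scores)                        # (batch, head, time1, time2)
    return attn @ value                           # (batch, head, time1, d_k)

B, H, T, Dk = 2, 4, 5, 8
v = np.random.randn(B, H, T, Dk)
s = np.random.randn(B, H, T, T)
m = np.ones((B, 1, T), dtype=bool)
m[:, :, -2:] = False                              # last two frames are padding
print(forward_attention(v, s).shape, forward_attention(v, s, m).shape)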
@@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer): x = x.transpose([0, 2, 1]) # [B, C, T] # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) if self.lorder > 0: - if paddle.shape(cache)[2] == 0: # cache_t == 0 + if paddle.shape(cache)[2] == 0: # cache_t == 0 x = nn.functional.pad( x, [self.lorder, 0], 'constant', 0.0, data_format='NCL') else: @@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer): x = self.pointwise_conv2(x) # mask batch padding - if paddle.shape(mask_pad)[2] > 0: # time > 0 + if paddle.shape(mask_pad)[2] > 0: # time > 0 x = x.masked_fill(mask_pad, 0.0) x = x.transpose([0, 2, 1]) # [B, T, C] diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 0f50db21d..652660e16 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -53,7 +53,7 @@ class CTCDecoderBase(nn.Layer): enc_n_units, blank_id=0, dropout_rate: float=0.0, - reduction: bool=True, + reduction_type: str="sum", batch_average: bool=True, grad_norm_type: Union[str, None]=None): """CTC decoder @@ -73,7 +73,7 @@ class CTCDecoderBase(nn.Layer): self.odim = odim self.dropout = nn.Dropout(dropout_rate) self.ctc_lo = Linear(enc_n_units, self.odim) - reduction_type = "sum" if reduction else "none" + reduction_type = reduction_type if reduction_type else "none" self.criterion = CTCLoss( blank=self.blank_id, reduction=reduction_type, diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index c8843b723..37b124e84 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer): if self.concat_after: tgt_concat = paddle.cat( - (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1) x = residual + self.concat_linear1(tgt_concat) else: x = residual + self.dropout( - self.self_attn(tgt_q, tgt, tgt, tgt_q_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[ - 0]) + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]) if not self.normalize_before: x = self.norm1(x) @@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer): x = self.norm2(x) if self.concat_after: x_concat = paddle.cat( - (x, self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), - paddle.zeros([0, 0, 0, 0]))[0]), - dim=-1) + (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1) x = residual + self.concat_linear2(x_concat) else: x = residual + self.dropout( - self.src_attn(x, memory, memory, memory_mask, - paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0]) + self.src_attn(x, memory, memory, memory_mask)[0]) if not self.normalize_before: x = self.norm2(x) diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index cf4e32fa4..bff2d69bb 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -14,6 +14,8 @@ # limitations under the License. 
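Editor's note: the ctc.py hunk replaces the boolean `reduction` flag with a `reduction_type` string. A hedged sketch of what "sum" reduction plus `batch_average` means for the loss value; the per-utterance losses below are made-up stand-ins for real CTCLoss output, not the module's implementation:

# Reduction semantics behind the ctc.py hunk, on toy per-utterance losses.
import numpy as np

per_utt_loss = np.array([12.3, 7.9, 15.4, 9.1])     # hypothetical CTC loss per utterance

def reduce_ctc(per_utt, reduction_type="sum", batch_average=True):
    if reduction_type == "none":
        return per_utt                               # keep one loss per utterance
    loss = per_utt.sum()                             # "sum" reduction over the batch
    if batch_average:
        loss = loss / len(per_utt)                   # then average over the batch size
    return loss

print(reduce_ctc(per_utt_loss))                      # scalar, batch-averaged
print(reduce_ctc(per_utt_loss, "none"))              # vector of per-utterance losses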
# Modified from wenet(https://github.com/wenet-e2e/wenet) """Encoder definition.""" +from typing import List +from typing import Optional from typing import Tuple import paddle @@ -175,9 +177,7 @@ class BaseEncoder(nn.Layer): decoding_chunk_size, self.static_chunk_size, num_decoding_left_chunks) for layer in self.encoders: - xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad, - paddle.zeros([0, 0, 0, 0]), - paddle.zeros([0, 0, 0, 0])) + xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad) if self.normalize_before: xs = self.after_norm(xs) # Here we assume the mask is not changed in encoder layers, so just @@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer): xs: paddle.Tensor, offset: int, required_cache_size: int, - att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]), - att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool) + att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), + cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]), + att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Forward just one chunk Args: @@ -227,7 +227,7 @@ class BaseEncoder(nn.Layer): xs = self.global_cmvn(xs) # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D) - xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) + xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset) # after embed, xs=(B=1, chunk_size, hidden-dim) elayers = paddle.shape(att_cache)[0] @@ -252,17 +252,14 @@ class BaseEncoder(nn.Layer): # att_cache[i:i+1] = (1, head, cache_t1, d_k*2) # cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2) xs, _, new_att_cache, new_cnn_cache = layer( - xs, - att_mask, - pos_emb, - mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool), - att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache, - cnn_cache=cnn_cache[i:i + 1] - if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, ) + xs, att_mask, pos_emb, + att_cache=att_cache[i:i+1] if elayers > 0 else att_cache, + cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, + ) # new_att_cache = (1, head, attention_key_size, d_k*2) # new_cnn_cache = (B=1, hidden-dim, cache_t2) - r_att_cache.append(new_att_cache[:, :, next_cache_start:, :]) - r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim + r_att_cache.append(new_att_cache[:,:, next_cache_start:, :]) + r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim if self.normalize_before: xs = self.after_norm(xs) @@ -273,6 +270,7 @@ class BaseEncoder(nn.Layer): r_cnn_cache = paddle.concat(r_cnn_cache, axis=0) return xs, r_att_cache, r_cnn_cache + def forward_chunk_by_chunk( self, xs: paddle.Tensor, @@ -317,8 +315,8 @@ class BaseEncoder(nn.Layer): num_frames = xs.shape[1] required_cache_size = decoding_chunk_size * num_decoding_left_chunks - att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]) + att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) + cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]) outputs = [] offset = 0 @@ -328,8 +326,7 @@ class BaseEncoder(nn.Layer): chunk_xs = xs[:, cur:end, :] (y, att_cache, cnn_cache) = self.forward_chunk( - chunk_xs, offset, required_cache_size, att_cache, cnn_cache, - paddle.ones([0, 0, 0], dtype=paddle.bool)) + chunk_xs, offset, required_cache_size, att_cache, cnn_cache) outputs.append(y) offset += y.shape[1] diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index 
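Editor's note: the encoder hunk above gives `forward_chunk` default empty attention/CNN caches and lets `forward_chunk_by_chunk` thread them between calls. A hedged, stubbed sketch of that streaming loop; the encoder below is a trivial stand-in (it just doubles its input and accumulates frames), not the real Conformer:

# Stubbed sketch of chunk-by-chunk decoding with caches carried across calls.
import numpy as np

class StubStreamingEncoder:
    """Trivial stand-in exposing the same forward_chunk contract as the hunk."""
    def forward_chunk(self, chunk_xs, offset, required_cache_size,
                      att_cache=np.zeros((0, 0)), cnn_cache=np.zeros((0, 0))):
        y = chunk_xs * 2.0                                     # placeholder "encoding"
        hist = chunk_xs if att_cache.size == 0 else np.concatenate(
            [att_cache, chunk_xs], axis=0)                     # frames seen so far
        if required_cache_size > 0:                            # trim to the cache budget
            hist = hist[-required_cache_size:]
        elif required_cache_size == 0:
            hist = hist[:0]
        return y, hist, cnn_cache                              # cnn cache unused in the stub

def forward_chunk_by_chunk(encoder, xs, decoding_chunk_size=4, num_left_chunks=2):
    required_cache_size = decoding_chunk_size * num_left_chunks
    att_cache = np.zeros((0, xs.shape[1]))
    cnn_cache = np.zeros((0, 0))
    outputs, offset = [], 0
    for cur in range(0, xs.shape[0], decoding_chunk_size):
        chunk = xs[cur:cur + decoding_chunk_size]
        y, att_cache, cnn_cache = encoder.forward_chunk(
            chunk, offset, required_cache_size, att_cache, cnn_cache)
        outputs.append(y)
        offset += y.shape[0]
    return np.concatenate(outputs, axis=0)

feats = np.random.randn(10, 16)                                # (frames, feat_dim)
print(forward_chunk_by_chunk(StubStreamingEncoder(), feats).shape)   # (10, 16)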
4555b535f..5f810dfde 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle. - Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: @@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer): if self.normalize_before: x = self.norm1(x) - x_att, new_att_cache = self.self_attn( - x, x, x, mask, paddle.empty([0]), cache=att_cache) + x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache) if self.concat_after: x_concat = paddle.concat((x, x_att), axis=-1) @@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer): x: paddle.Tensor, mask: paddle.Tensor, pos_emb: paddle.Tensor, - mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool) - att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) - cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0]) + mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool), + att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), + cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]), ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """Compute encoded features. Args: diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py index 6eae5713e..cdcf2e052 100644 --- a/paddlespeech/s2t/modules/initializer.py +++ b/paddlespeech/s2t/modules/initializer.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import numpy as np class DefaultInitializerContext(object): """ diff --git a/paddlespeech/s2t/training/optimizer.py b/paddlespeech/s2t/training/optimizer.py index f7f70c570..75d3f5f5c 100644 --- a/paddlespeech/s2t/training/optimizer.py +++ b/paddlespeech/s2t/training/optimizer.py @@ -103,6 +103,8 @@ class OptimizerFactory(): grad_clip = ClipGradByGlobalNormWithLog( args['grad_clip']) if "grad_clip" in args else None + # grad_clip = paddle.nn.ClipGradByGlobalNorm( + # args['grad_clip']) if "grad_clip" in args else None weight_decay = L2Decay( args['weight_decay']) if "weight_decay" in args else None if weight_decay: diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py index b22f7ef85..3464e2299 100644 --- a/paddlespeech/s2t/training/scheduler.py +++ b/paddlespeech/s2t/training/scheduler.py @@ -106,6 +106,59 @@ class ConstantLR(LRScheduler): def get_lr(self): return self.base_lr +@register_scheduler +class NewBobScheduler(LRScheduler): + """ + Args: + learning_rate (float): The initial learning rate. It is a python float number. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. + verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``ConstantLR`` instance to schedule learning rate. 
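Editor's note: the optimizer hunk toggles between `ClipGradByGlobalNormWithLog` and the commented-out `paddle.nn.ClipGradByGlobalNorm`; both implement the same clip-by-global-norm rule. A hedged NumPy sketch of that rule on toy gradients (not the framework's implementation):

# Clip-by-global-norm: if the L2 norm over all gradients exceeds clip_norm,
# every gradient is scaled by clip_norm / global_norm.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    global_norm = np.sqrt(sum(float((g ** 2).sum()) for g in grads))
    if global_norm <= clip_norm:
        return grads, global_norm
    scale = clip_norm / global_norm
    return [g * scale for g in grads], global_norm

grads = [np.random.randn(3, 3) * 10, np.random.randn(5) * 10]
clipped, norm_before = clip_by_global_norm(grads, clip_norm=5.0)
norm_after = np.sqrt(sum(float((g ** 2).sum()) for g in clipped))
print(norm_before, norm_after)    # norm_after is at most 5.0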
+ """ + def __init__( + self, + learning_rate, + annealing_factor=0.5, + improvement_threshold=0.0025, + patient=0, + ): + self.hyperparam_value = learning_rate + self.annealing_factor = annealing_factor + self.improvement_threshold = improvement_threshold + self.patient = patient + self.metric_values = [] + self.current_patient = self.patient + + def __call__(self, metric_value): + """Returns the current and new value for the hyperparameter. + + Arguments + --------- + metric_value : int + A number for determining whether to change the hyperparameter value. + """ + old_value = new_value = self.hyperparam_value + if len(self.metric_values) > 0: + prev_metric = self.metric_values[-1] + # Update value if improvement too small and patience is 0 + if prev_metric == 0: # Prevent division by zero + improvement = 0 + else: + improvement = (prev_metric - metric_value) / prev_metric + if improvement < self.improvement_threshold: + if self.current_patient == 0: + new_value *= self.annealing_factor + self.current_patient = self.patient + else: + self.current_patient -= 1 + # Store relevant info + self.metric_values.append(metric_value) + self.hyperparam_value = new_value + + return old_value, new_value + def dynamic_import_scheduler(module): """Import Scheduler class dynamically. diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py index a7eb9892d..815b61e0f 100644 --- a/paddlespeech/s2t/training/trainer.py +++ b/paddlespeech/s2t/training/trainer.py @@ -19,6 +19,8 @@ from pathlib import Path import paddle from paddle import distributed as dist +dist.init_parallel_env() + from visualdl import LogWriter from paddlespeech.s2t.training.reporter import ObsScope @@ -130,7 +132,9 @@ class Trainer(): latest_n=self.config.checkpoint.latest_n) # set random seed if needed + print(args.seed) if args.seed: + print('***********') seed_all(args.seed) logger.info(f"Set seed {args.seed}") @@ -176,7 +180,7 @@ class Trainer(): def init_parallel(self): """Init environment for multiprocess training. 
""" - dist.init_parallel_env() + # dist.init_parallel_env() @mp_tools.rank_zero_only def save(self, tag=None, infos: dict=None): diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 55f241ec7..8650154e9 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -25,7 +25,6 @@ asr_python: cfg_path: # [optional] ckpt_path: # [optional] decode_method: 'attention_rescoring' - num_decoding_left_chunks: -1 force_yes: True device: # set 'gpu:id' or 'cpu' @@ -39,7 +38,6 @@ asr_inference: lang: 'zh' sample_rate: 16000 cfg_path: - num_decoding_left_chunks: -1 decode_method: force_yes: True diff --git a/paddlespeech/server/engine/asr/online/ctc_endpoint.py b/paddlespeech/server/engine/asr/online/ctc_endpoint.py index 1b8ad1cb7..b87dbe805 100644 --- a/paddlespeech/server/engine/asr/online/ctc_endpoint.py +++ b/paddlespeech/server/engine/asr/online/ctc_endpoint.py @@ -102,10 +102,8 @@ class OnlineCTCEndpoint: assert self.num_frames_decoded >= self.trailing_silence_frames assert self.frame_shift_in_ms > 0 - - decoding_something = ( - self.num_frames_decoded > self.trailing_silence_frames - ) and decoding_something + + decoding_something = (self.num_frames_decoded > self.trailing_silence_frames) and decoding_something utterance_length = self.num_frames_decoded * self.frame_shift_in_ms trailing_silence = self.trailing_silence_frames * self.frame_shift_in_ms diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py index 6daae5be3..ab4f11305 100644 --- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py @@ -21,12 +21,12 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.utils.utility import UpdateConfig from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils import onnx_infer diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py index 0fd5d1bc6..182e64180 100644 --- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py @@ -21,10 +21,10 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.utility import UpdateConfig diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py index 87d88ee60..4df38f09d 100644 --- 
a/paddlespeech/server/engine/asr/online/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py @@ -21,10 +21,10 @@ import paddle from numpy import float32 from yacs.config import CfgNode -from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.cli.asr.infer import ASRExecutor from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource +from paddlespeech.audio.transform.transformation import Transformation from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.modules.ctc import CTCDecoder from paddlespeech.s2t.utils.tensor_utils import add_sos_eos @@ -130,8 +130,8 @@ class PaddleASRConnectionHanddler: ## conformer # cache for conformer online - self.att_cache = paddle.zeros([0, 0, 0, 0]) - self.cnn_cache = paddle.zeros([0, 0, 0, 0]) + self.att_cache = paddle.zeros([0,0,0,0]) + self.cnn_cache = paddle.zeros([0,0,0,0]) self.encoder_out = None # conformer decoding state @@ -474,14 +474,9 @@ class PaddleASRConnectionHanddler: # cur chunk chunk_xs = self.cached_feat[:, cur:end, :] # forward chunk - (y, self.att_cache, - self.cnn_cache) = self.model.encoder.forward_chunk( - chunk_xs, - self.offset, - required_cache_size, - att_cache=self.att_cache, - cnn_cache=self.cnn_cache, - att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool)) + (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk( + chunk_xs, self.offset, required_cache_size, + self.att_cache, self.cnn_cache) outputs.append(y) # update the global offset, in decoding frame unit diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py index e297e5c21..02c40fd12 100644 --- a/paddlespeech/server/engine/asr/python/asr_engine.py +++ b/paddlespeech/server/engine/asr/python/asr_engine.py @@ -68,12 +68,9 @@ class ASREngine(BaseEngine): return False self.executor._init_from_path( - model_type=self.config.model, - lang=self.config.lang, - sample_rate=self.config.sample_rate, - cfg_path=self.config.cfg_path, - decode_method=self.config.decode_method, - ckpt_path=self.config.ckpt_path) + self.config.model, self.config.lang, self.config.sample_rate, + self.config.cfg_path, self.config.decode_method, + self.config.ckpt_path) logger.info("Initialize ASR server engine successfully on device: %s." % (self.device)) diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py index 7b8f667db..f7d60648d 100644 --- a/paddlespeech/server/engine/vector/python/vector_engine.py +++ b/paddlespeech/server/engine/vector/python/vector_engine.py @@ -105,8 +105,7 @@ class PaddleVectorConnectionHandler: # we can not reuse the cache io.BytesIO(audio) data, # because the soundfile will change the io.BytesIO(audio) to the end # thus we should convert the base64 string to io.BytesIO when we need the audio data - if not self.executor._check( - io.BytesIO(audio), sample_rate, force_yes=True): + if not self.executor._check(io.BytesIO(audio), sample_rate): logger.debug("check the audio sample rate occurs error") return np.array([0.0]) diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py index c00648b1f..2cb7a11a2 100644 --- a/paddlespeech/t2s/datasets/am_batch_fn.py +++ b/paddlespeech/t2s/datasets/am_batch_fn.py @@ -11,12 +11,19 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. +from typing import Collection +from typing import Dict +from typing import List +from typing import Tuple + import numpy as np import paddle from paddlespeech.t2s.datasets.batch import batch_sequences +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.modules.nets_utils import get_seg_pos from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask +from paddlespeech.t2s.modules.nets_utils import pad_list from paddlespeech.t2s.modules.nets_utils import phones_masking from paddlespeech.t2s.modules.nets_utils import phones_text_masking @@ -485,56 +492,180 @@ def vits_single_spk_batch_fn(examples): return batch -def vits_multi_spk_batch_fn(examples): - """ - Returns: - Dict[str, Any]: - - text (Tensor): Text index tensor (B, T_text). - - text_lengths (Tensor): Text length tensor (B,). - - feats (Tensor): Feature tensor (B, T_feats, aux_channels). - - feats_lengths (Tensor): Feature length tensor (B,). - - speech (Tensor): Speech waveform tensor (B, T_wav). - - spk_id (Optional[Tensor]): Speaker index tensor (B,) or (B, 1). - - spk_emb (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim). - """ - # fields = ["text", "text_lengths", "feats", "feats_lengths", "speech", "spk_id"/"spk_emb"] - text = [np.array(item["text"], dtype=np.int64) for item in examples] - feats = [np.array(item["feats"], dtype=np.float32) for item in examples] - speech = [np.array(item["wave"], dtype=np.float32) for item in examples] - text_lengths = [ - np.array(item["text_lengths"], dtype=np.int64) for item in examples - ] - feats_lengths = [ - np.array(item["feats_lengths"], dtype=np.int64) for item in examples - ] +# for ERNIE SAT +class MLMCollateFn: + """Functor class of common_collate_fn()""" - text = batch_sequences(text) - feats = batch_sequences(feats) - speech = batch_sequences(speech) + def __init__( + self, + feats_extract, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + text_masking: bool=False, + attention_window: int=0, + not_sequence: Collection[str]=(), ): + self.mlm_prob = mlm_prob + self.mean_phn_span = mean_phn_span + self.feats_extract = feats_extract + self.not_sequence = set(not_sequence) + self.attention_window = attention_window + self.seg_emb = seg_emb + self.text_masking = text_masking - # convert each batch to paddle.Tensor - text = paddle.to_tensor(text) + def __call__(self, data: Collection[Tuple[str, Dict[str, np.ndarray]]] + ) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + return mlm_collate_fn( + data, + feats_extract=self.feats_extract, + mlm_prob=self.mlm_prob, + mean_phn_span=self.mean_phn_span, + seg_emb=self.seg_emb, + text_masking=self.text_masking, + not_sequence=self.not_sequence) + + +def mlm_collate_fn( + data: Collection[Tuple[str, Dict[str, np.ndarray]]], + feats_extract=None, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + text_masking: bool=False, + pad_value: int=0, + not_sequence: Collection[str]=(), +) -> Tuple[List[str], Dict[str, paddle.Tensor]]: + uttids = [u for u, _ in data] + data = [d for _, d in data] + + assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching" + assert all(not k.endswith("_lens") + for k in data[0]), f"*_lens is reserved: {list(data[0])}" + + output = {} + for key in data[0]: + + array_list = [d[key] for d in data] + + # Assume the first axis is length: + # tensor_list: Batch x (Length, ...) 
+ tensor_list = [paddle.to_tensor(a) for a in array_list] + # tensor: (Batch, Length, ...) + tensor = pad_list(tensor_list, pad_value) + output[key] = tensor + + # lens: (Batch,) + if key not in not_sequence: + lens = paddle.to_tensor( + [d[key].shape[0] for d in data], dtype=paddle.int64) + output[key + "_lens"] = lens + + feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0])) feats = paddle.to_tensor(feats) - text_lengths = paddle.to_tensor(text_lengths) - feats_lengths = paddle.to_tensor(feats_lengths) + print("feats.shape:", feats.shape) + feats_lens = paddle.shape(feats)[0] + feats = paddle.unsqueeze(feats, 0) - batch = { - "text": text, - "text_lengths": text_lengths, - "feats": feats, - "feats_lengths": feats_lengths, - "speech": speech - } - # spk_emb has a higher priority than spk_id - if "spk_emb" in examples[0]: - spk_emb = [ - np.array(item["spk_emb"], dtype=np.float32) for item in examples - ] - spk_emb = batch_sequences(spk_emb) - spk_emb = paddle.to_tensor(spk_emb) - batch["spk_emb"] = spk_emb - elif "spk_id" in examples[0]: - spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples] - spk_id = paddle.to_tensor(spk_id) - batch["spk_id"] = spk_id - return batch + text = output["text"] + text_lens = output["text_lens"] + align_start = output["align_start"] + align_start_lens = output["align_start_lens"] + align_end = output["align_end"] + + max_tlen = max(text_lens) + max_slen = max(feats_lens) + + speech_pad = feats[:, :max_slen] + + text_pad = text + text_mask = make_non_pad_mask( + text_lens, text_pad, length_dim=1).unsqueeze(-2) + speech_mask = make_non_pad_mask( + feats_lens, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2) + + span_bdy = None + if 'span_bdy' in output.keys(): + span_bdy = output['span_bdy'] + + # dual_mask 的是混合中英时候同时 mask 语音和文本 + # ernie sat 在实现跨语言的时候都 mask 了 + if text_masking: + masked_pos, text_masked_pos = phones_text_masking( + xs_pad=speech_pad, + src_mask=speech_mask, + text_pad=text_pad, + text_mask=text_mask, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + mlm_prob=mlm_prob, + mean_phn_span=mean_phn_span, + span_bdy=span_bdy) + # 训练纯中文和纯英文的 -> a3t 没有对 phoneme 做 mask, 只对语音 mask 了 + # a3t 和 ernie sat 的区别主要在于做 mask 的时候 + else: + masked_pos = phones_masking( + xs_pad=speech_pad, + src_mask=speech_mask, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + mlm_prob=mlm_prob, + mean_phn_span=mean_phn_span, + span_bdy=span_bdy) + text_masked_pos = paddle.zeros(paddle.shape(text_pad)) + + output_dict = {} + + speech_seg_pos, text_seg_pos = get_seg_pos( + speech_pad=speech_pad, + text_pad=text_pad, + align_start=align_start, + align_end=align_end, + align_start_lens=align_start_lens, + seg_emb=seg_emb) + output_dict['speech'] = speech_pad + output_dict['text'] = text_pad + output_dict['masked_pos'] = masked_pos + output_dict['text_masked_pos'] = text_masked_pos + output_dict['speech_mask'] = speech_mask + output_dict['text_mask'] = text_mask + output_dict['speech_seg_pos'] = speech_seg_pos + output_dict['text_seg_pos'] = text_seg_pos + output = (uttids, output_dict) + return output + + +def build_mlm_collate_fn( + sr: int=24000, + n_fft: int=2048, + hop_length: int=300, + win_length: int=None, + n_mels: int=80, + fmin: int=80, + fmax: int=7600, + mlm_prob: float=0.8, + mean_phn_span: int=8, + seg_emb: bool=False, + epoch: int=-1, ): + feats_extract_class = LogMelFBank + + feats_extract = feats_extract_class( + sr=sr, + n_fft=n_fft, + 
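Editor's note: the core of `mlm_collate_fn` above is padding variable-length arrays along the first axis and recording a `<key>_lens` entry per key. A hedged NumPy sketch of that step; `pad_list` below is a local stand-in, not the helper imported from paddlespeech.t2s.modules.nets_utils, and the data is toy:

# Pad a batch of variable-length arrays and keep their original lengths.
import numpy as np

def pad_list(arrays, pad_value=0.0):
    max_len = max(a.shape[0] for a in arrays)
    out = np.full((len(arrays), max_len) + arrays[0].shape[1:], pad_value,
                  dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        out[i, :a.shape[0]] = a
    return out

data = [{"text": np.array([3, 7, 9])}, {"text": np.array([5, 2])}]
output = {}
for key in data[0]:
    array_list = [d[key] for d in data]
    output[key] = pad_list(array_list, pad_value=0)
    output[key + "_lens"] = np.array([a.shape[0] for a in array_list], dtype=np.int64)

print(output["text"])        # (2, 3) padded batch
print(output["text_lens"])   # [3 2]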
hop_length=hop_length, + win_length=win_length, + n_mels=n_mels, + fmin=fmin, + fmax=fmax) + + if epoch == -1: + mlm_prob_factor = 1 + else: + mlm_prob_factor = 0.8 + + return MLMCollateFn( + feats_extract=feats_extract, + mlm_prob=mlm_prob * mlm_prob_factor, + mean_phn_span=mean_phn_span, + seg_emb=seg_emb) diff --git a/paddlespeech/t2s/datasets/sampler.py b/paddlespeech/t2s/datasets/sampler.py index cbc9764c5..a69bc8600 100644 --- a/paddlespeech/t2s/datasets/sampler.py +++ b/paddlespeech/t2s/datasets/sampler.py @@ -1,9 +1,8 @@ +import paddle import math - import numpy as np from paddle.io import BatchSampler - class ErnieSATSampler(BatchSampler): """Sampler that restricts data loading to a subset of the dataset. In such case, each process can pass a DistributedBatchSampler instance @@ -71,7 +70,7 @@ class ErnieSATSampler(BatchSampler): assert isinstance(drop_last, bool), \ "drop_last should be a boolean number" - from paddle.distributed import ParallelEnv + from paddle.fluid.dygraph.parallel import ParallelEnv if num_replicas is not None: assert isinstance(num_replicas, int) and num_replicas > 0, \ @@ -111,8 +110,8 @@ class ErnieSATSampler(BatchSampler): subsampled_indices.extend(indices[i:i + self.batch_size]) indices = indices[len(indices) - last_batch_size:] - subsampled_indices.extend( - indices[self.local_rank * last_local_batch_size:( + subsampled_indices.extend(indices[ + self.local_rank * last_local_batch_size:( self.local_rank + 1) * last_local_batch_size]) return subsampled_indices diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py index 464f51a3b..529a8221c 100755 --- a/paddlespeech/t2s/exps/ernie_sat/align.py +++ b/paddlespeech/t2s/exps/ernie_sat/align.py @@ -19,9 +19,9 @@ import librosa import numpy as np import pypinyin from praatio import textgrid - -from paddlespeech.t2s.exps.ernie_sat.utils import get_dict from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name +from paddlespeech.t2s.exps.ernie_sat.utils import get_dict + DICT_EN = 'tools/aligner/cmudict-0.7b' DICT_ZH = 'tools/aligner/simple.lexicon' @@ -30,7 +30,6 @@ MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip' MFA_PATH = 'tools/montreal-forced-aligner/bin' os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH'] - def _get_max_idx(dic): return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1] @@ -107,11 +106,11 @@ def alignment(wav_path: str, wav_name = os.path.basename(wav_path) utt = wav_name.split('.')[0] # prepare data for MFA - tmp_name = get_tmp_name(text=text) + tmp_name = get_tmp_name(text=text) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) - print("tmp_name in alignment:", tmp_name) + print("tmp_name in alignment:",tmp_name) shutil.copyfile(wav_path, tmpbase / wav_name) txt_name = utt + '.txt' @@ -341,7 +340,7 @@ def get_phns_spans(wav_path: str, if __name__ == '__main__': text = "For that reason cover should not be given." 
- phn, dur, word2phns = alignment("source/p243_313.wav", text, lang='en') + phn, dur, word2phns = alignment("exp/p243_313.wav", text, lang='en') print(phn, dur) print(word2phns) print("---------------------------------") @@ -353,7 +352,7 @@ if __name__ == '__main__': style=pypinyin.Style.TONE3, tone_sandhi=True) text_zh = " ".join(text_zh) - phn, dur, word2phns = alignment("source/000001.wav", text_zh, lang='zh') + phn, dur, word2phns = alignment("exp/000001.wav", text_zh, lang='zh') print(phn, dur) print(word2phns) print("---------------------------------") @@ -368,7 +367,7 @@ if __name__ == '__main__': print("---------------------------------") outs = get_phns_spans( - wav_path="source/p243_313.wav", + wav_path="exp/p243_313.wav", old_str="For that reason cover should not be given.", new_str="for that reason cover is impossible to be given.") diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index 21c9ae044..95b07367c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -11,41 +11,35 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import argparse -import os -from pathlib import Path -from typing import List - import librosa import numpy as np -import paddle -import pypinyin import soundfile as sf -import yaml -from pypinyin_dict.phrase_pinyin_data import large_pinyin -from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn -from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.exps.ernie_sat.align import get_phns_spans from paddlespeech.t2s.exps.ernie_sat.utils import eval_durs from paddlespeech.t2s.exps.ernie_sat.utils import get_dur_adj_factor from paddlespeech.t2s.exps.ernie_sat.utils import get_span_bdy -from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name -from paddlespeech.t2s.exps.syn_utils import get_am_inference -from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.datasets.get_feats import LogMelFBank from paddlespeech.t2s.exps.syn_utils import norm -from paddlespeech.t2s.utils import str2bool -large_pinyin.load() +from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name -def _p2id(phonemes: List[str]) -> np.ndarray: + + + + +def _p2id(self, phonemes: List[str]) -> np.ndarray: # replace unk phone with sp - phonemes = [phn if phn in vocab_phones else "sp" for phn in phonemes] + phonemes = [ + phn if phn in vocab_phones else "sp" for phn in phonemes + ] phone_ids = [vocab_phones[item] for item in phonemes] return np.array(phone_ids, np.int64) + def prep_feats_with_dur(wav_path: str, old_str: str='', new_str: str='', @@ -73,12 +67,12 @@ def prep_feats_with_dur(wav_path: str, fs=fs, n_shift=n_shift) - mfa_start = phns_spans_outs['mfa_start'] - mfa_end = phns_spans_outs['mfa_end'] - old_phns = phns_spans_outs['old_phns'] - new_phns = phns_spans_outs['new_phns'] - span_to_repl = phns_spans_outs['span_to_repl'] - span_to_add = phns_spans_outs['span_to_add'] + mfa_start = phns_spans_outs["mfa_start"] + mfa_end = phns_spans_outs["mfa_end"] + old_phns = phns_spans_outs["old_phns"] + new_phns = phns_spans_outs["new_phns"] + span_to_repl = 
phns_spans_outs["span_to_repl"] + span_to_add = phns_spans_outs["span_to_add"] # 中文的 phns 不一定都在 fastspeech2 的字典里, 用 sp 代替 if target_lang in {'en', 'zh'}: @@ -138,7 +132,7 @@ def prep_feats_with_dur(wav_path: str, [wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]]) # 音频是正常遮住了 - sf.write(str("mask_wav.wav"), new_wav, samplerate=fs) + sf.write(str("new_wav.wav"), new_wav, samplerate=fs) # 4. get old and new mel span to be mask old_span_bdy = get_span_bdy( @@ -158,6 +152,8 @@ def prep_feats_with_dur(wav_path: str, return outs + + def prep_feats(wav_path: str, old_str: str='', new_str: str='', @@ -167,7 +163,7 @@ def prep_feats(wav_path: str, fs: int=24000, n_shift: int=300): - with_dur_outs = prep_feats_with_dur( + outs = prep_feats_with_dur( wav_path=wav_path, old_str=old_str, new_str=new_str, @@ -180,240 +176,138 @@ def prep_feats(wav_path: str, wav_name = os.path.basename(wav_path) utt_id = wav_name.split('.')[0] - wav = with_dur_outs['new_wav'] - phns = with_dur_outs['new_phns'] - mfa_start = with_dur_outs['new_mfa_start'] - mfa_end = with_dur_outs['new_mfa_end'] - old_span_bdy = with_dur_outs['old_span_bdy'] - new_span_bdy = with_dur_outs['new_span_bdy'] + wav = outs['new_wav'] + phns = outs['new_phns'] + mfa_start = outs['new_mfa_start'] + mfa_end = outs['new_mfa_end'] + old_span_bdy = outs['old_span_bdy'] + new_span_bdy = outs['new_span_bdy'] span_bdy = np.array(new_span_bdy) + text = _p2id(phns) mel = mel_extractor.get_log_mel_fbank(wav) erniesat_mean, erniesat_std = np.load(erniesat_stat) normed_mel = norm(mel, erniesat_mean, erniesat_std) - tmp_name = get_tmp_name(text=old_str) + tmp_name = get_tmp_name(text=old_str) tmpbase = './tmp_dir/' + tmp_name tmpbase = Path(tmpbase) tmpbase.mkdir(parents=True, exist_ok=True) + print("tmp_name in synthesize_e2e:",tmp_name) mel_path = tmpbase / 'mel.npy' - np.save(mel_path, normed_mel) + print("mel_path:",mel_path) + np.save(mel_path, logmel) durations = [e - s for e, s in zip(mfa_end, mfa_start)] - text = _p2id(phns) - datum = { - "utt_id": utt_id, - "spk_id": 0, - "text": text, - "text_lengths": len(text), - "speech_lengths": len(normed_mel), - "durations": durations, - "speech": np.load(mel_path), - "align_start": mfa_start, + datum={ + "utt_id": utt_id, + "spk_id": 0, + "text": text, + "text_lengths": len(text), + "speech_lengths": 115, + "durations": durations, + "speech": mel_path, + "align_start": mfa_start, "align_end": mfa_end, "span_bdy": span_bdy } batch = collate_fn([datum]) - outs = dict() - outs['batch'] = batch - outs['old_span_bdy'] = old_span_bdy - outs['new_span_bdy'] = new_span_bdy - return outs - - -def get_mlm_output(wav_path: str, - old_str: str='', - new_str: str='', - source_lang: str='en', - target_lang: str='en', - duration_adjust: bool=True, - fs: int=24000, - n_shift: int=300): - - prep_feats_outs = prep_feats( + print("batch:",batch) + + return batch, old_span_bdy, new_span_bdy + + +def decode_with_model(mlm_model: nn.Layer, + collate_fn, + wav_path: str, + old_str: str='', + new_str: str='', + source_lang: str='en', + target_lang: str='en', + use_teacher_forcing: bool=False, + duration_adjust: bool=True, + fs: int=24000, + n_shift: int=300, + token_list: List[str]=[]): + batch, old_span_bdy, new_span_bdy = prep_feats( + source_lang=source_lang, + target_lang=target_lang, wav_path=wav_path, old_str=old_str, new_str=new_str, - source_lang=source_lang, - target_lang=target_lang, duration_adjust=duration_adjust, fs=fs, - n_shift=n_shift) + n_shift=n_shift, + token_list=token_list) + - batch = 
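Editor's note: `prep_feats` above normalizes the extracted log-mel with the training statistics via `norm(mel, erniesat_mean, erniesat_std)`. A hedged sketch of that normalization and its inverse; the statistics below are random stand-ins for the erniesat_stat file:

# Per-bin mean/variance normalization of the log-mel features and its inverse.
import numpy as np

def norm(data, mean, std):
    return (data - mean) / std

def denorm(data, mean, std):
    return data * std + mean

mel = np.random.randn(115, 80)                  # (frames, n_mels) toy log-mel fbank
erniesat_mean, erniesat_std = np.zeros(80), np.ones(80) * 2.0
normed_mel = norm(mel, erniesat_mean, erniesat_std)
assert np.allclose(denorm(normed_mel, erniesat_mean, erniesat_std), mel)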
prep_feats_outs['batch'] - new_span_bdy = prep_feats_outs['new_span_bdy'] - old_span_bdy = prep_feats_outs['old_span_bdy'] - out_mels = erniesat_inference( - speech=batch['speech'], - text=batch['text'], - masked_pos=batch['masked_pos'], - speech_mask=batch['speech_mask'], - text_mask=batch['text_mask'], - speech_seg_pos=batch['speech_seg_pos'], - text_seg_pos=batch['text_seg_pos'], - span_bdy=new_span_bdy) + feats = collate_fn(batch)[1] + + if 'text_masked_pos' in feats.keys(): + feats.pop('text_masked_pos') + + output = mlm_model.inference( + text=feats['text'], + speech=feats['speech'], + masked_pos=feats['masked_pos'], + speech_mask=feats['speech_mask'], + text_mask=feats['text_mask'], + speech_seg_pos=feats['speech_seg_pos'], + text_seg_pos=feats['text_seg_pos'], + span_bdy=new_span_bdy, + use_teacher_forcing=use_teacher_forcing) # 拼接音频 - output_feat = paddle.concat(x=out_mels, axis=0) + output_feat = paddle.concat(x=output, axis=0) wav_org, _ = librosa.load(wav_path, sr=fs) - outs = dict() - outs['wav_org'] = wav_org - outs['output_feat'] = output_feat - outs['old_span_bdy'] = old_span_bdy - outs['new_span_bdy'] = new_span_bdy - - return outs + return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, hop_length -def get_wav(wav_path: str, - source_lang: str='en', - target_lang: str='en', - old_str: str='', - new_str: str='', - duration_adjust: bool=True, - fs: int=24000, - n_shift: int=300): +if __name__ == '__main__': + fs = 24000 + n_shift = 300 + wav_path = "exp/p243_313.wav" + old_str = "For that reason cover should not be given." + # for edit + # new_str = "for that reason cover is impossible to be given." + # for synthesize + append_str = "do you love me i love you so much" + new_str = old_str + append_str - outs = get_mlm_output( + ''' + outs = prep_feats_with_dur( wav_path=wav_path, old_str=old_str, new_str=new_str, - source_lang=source_lang, - target_lang=target_lang, - duration_adjust=duration_adjust, fs=fs, n_shift=n_shift) - wav_org = outs['wav_org'] - output_feat = outs['output_feat'] + new_wav = outs['new_wav'] + new_phns = outs['new_phns'] + new_mfa_start = outs['new_mfa_start'] + new_mfa_end = outs['new_mfa_end'] old_span_bdy = outs['old_span_bdy'] new_span_bdy = outs['new_span_bdy'] - masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]] - - with paddle.no_grad(): - alt_wav = voc_inference(masked_feat) - alt_wav = np.squeeze(alt_wav) - - old_time_bdy = [n_shift * x for x in old_span_bdy] - wav_replaced = np.concatenate( - [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]]) - - wav_dict = {"origin": wav_org, "output": wav_replaced} - return wav_dict - - -def parse_args(): - # parse args and config - parser = argparse.ArgumentParser( - description="Synthesize with acoustic model & vocoder") - # ernie sat - - parser.add_argument( - '--erniesat_config', - type=str, - default=None, - help='Config of acoustic model.') - parser.add_argument( - '--erniesat_ckpt', - type=str, - default=None, - help='Checkpoint file of acoustic model.') - parser.add_argument( - "--erniesat_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training acoustic model." 
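Editor's note: the splice step above converts frame-level span boundaries to sample indices with the hop size `n_shift` and drops the vocoded segment into the original waveform. A hedged NumPy sketch of that step; the "vocoder" is a dummy and all values are toy:

# Replace a span of the original waveform with the re-synthesized segment.
import numpy as np

fs, n_shift = 24000, 300
wav_org = np.random.randn(fs * 2)                   # 2 s of "original" audio
old_span_bdy = [40, 80]                             # mel frames to replace in the original
masked_feat = np.random.randn(60, 80)               # re-generated mel for the new span

def dummy_voc_inference(mel):
    return np.random.randn(mel.shape[0] * n_shift)  # one hop of samples per mel frame

alt_wav = dummy_voc_inference(masked_feat)
old_time_bdy = [n_shift * x for x in old_span_bdy]  # frame index -> sample index
wav_replaced = np.concatenate(
    [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
print(len(wav_org), len(alt_wav), len(wav_replaced))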
- ) - parser.add_argument( - "--phones_dict", type=str, default=None, help="phone vocabulary file.") - # vocoder - parser.add_argument( - '--voc', - type=str, - default='pwgan_csmsc', - choices=[ - 'pwgan_aishell3', - 'pwgan_vctk', - 'hifigan_aishell3', - 'hifigan_vctk', - ], - help='Choose vocoder type of tts task.') - parser.add_argument( - '--voc_config', type=str, default=None, help='Config of voc.') - parser.add_argument( - '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') - parser.add_argument( - "--voc_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training voc." - ) - # other - parser.add_argument( - "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") - - # ernie sat related - parser.add_argument("--task_name", type=str, help="task name") - parser.add_argument("--wav_path", type=str, help="path of old wav") - parser.add_argument("--old_str", type=str, help="old string") - parser.add_argument("--new_str", type=str, help="new string") - parser.add_argument( - "--source_lang", type=str, default="en", help="source language") - parser.add_argument( - "--target_lang", type=str, default="en", help="target language") - parser.add_argument( - "--duration_adjust", - type=str2bool, - default=True, - help="whether to adjust duration.") - parser.add_argument("--output_name", type=str, default="output.wav") - - args = parser.parse_args() - return args - + print("---------------------------------") -if __name__ == '__main__': - args = parse_args() + print("new_wav:", new_wav) + print("new_phns:", new_phns) + print("new_mfa_start:", new_mfa_start) + print("new_mfa_end:", new_mfa_end) + print("old_span_bdy:", old_span_bdy) + print("new_span_bdy:", new_span_bdy) + print("---------------------------------") + ''' - if args.ngpu == 0: - paddle.set_device("cpu") - elif args.ngpu > 0: - paddle.set_device("gpu") - else: - print("ngpu should >= 0 !") + erniesat_config = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/local/default.yaml" - # evaluate(args) - with open(args.erniesat_config) as f: + with open(erniesat_config) as f: erniesat_config = CfgNode(yaml.safe_load(f)) - old_str = args.old_str - new_str = args.new_str - - # convert Chinese characters to pinyin - if args.source_lang == 'zh': - old_str = pypinyin.lazy_pinyin( - old_str, - neutral_tone_with_five=True, - style=pypinyin.Style.TONE3, - tone_sandhi=True) - old_str = ' '.join(old_str) - if args.target_lang == 'zh': - new_str = pypinyin.lazy_pinyin( - new_str, - neutral_tone_with_five=True, - style=pypinyin.Style.TONE3, - tone_sandhi=True) - new_str = ' '.join(new_str) - - if args.task_name == 'edit': - new_str = new_str - elif args.task_name == 'synthesize': - new_str = old_str + new_str - else: - new_str = old_str + new_str - print("new_str:", new_str) + + erniesat_stat = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/train/speech_stats.npy" # Extractor mel_extractor = LogMelFBank( @@ -425,51 +319,28 @@ if __name__ == '__main__': n_mels=erniesat_config.n_mels, fmin=erniesat_config.fmin, fmax=erniesat_config.fmax) + + collate_fn = build_erniesat_collate_fn( mlm_prob=erniesat_config.mlm_prob, mean_phn_span=erniesat_config.mean_phn_span, seg_emb=erniesat_config.model['enc_input_layer'] == 'sega_mlm', text_masking=False) - + + phones_dict='/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/phone_id_map.txt' vocab_phones = {} - with open(args.phones_dict, 'rt') as 
f: + with open(phones_dict, 'rt') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) - # ernie sat model - erniesat_inference = get_am_inference( - am='erniesat_dataset', - am_config=erniesat_config, - am_ckpt=args.erniesat_ckpt, - am_stat=args.erniesat_stat, - phones_dict=args.phones_dict) - - with open(args.voc_config) as f: - voc_config = CfgNode(yaml.safe_load(f)) - - # vocoder - voc_inference = get_voc_inference( - voc=args.voc, - voc_config=voc_config, - voc_ckpt=args.voc_ckpt, - voc_stat=args.voc_stat) - - erniesat_stat = args.erniesat_stat - - wav_dict = get_wav( - wav_path=args.wav_path, - source_lang=args.source_lang, - target_lang=args.target_lang, - old_str=old_str, - new_str=new_str, - duration_adjust=args.duration_adjust, - fs=erniesat_config.fs, - n_shift=erniesat_config.n_shift) - - sf.write( - args.output_name, wav_dict['output'], samplerate=erniesat_config.fs) - print( - f"\033[1;32;m Generated audio saved into {args.output_name} ! \033[0m") + prep_feats(wav_path=wav_path, + old_str=old_str, + new_str=new_str, + fs=fs, + n_shift=n_shift) + + + diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..af653ef89 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -25,6 +25,7 @@ from paddle import DataParallel from paddle import distributed as dist from paddle import nn from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode diff --git a/paddlespeech/t2s/exps/ernie_sat/utils.py b/paddlespeech/t2s/exps/ernie_sat/utils.py index 6805e513c..9169efa36 100644 --- a/paddlespeech/t2s/exps/ernie_sat/utils.py +++ b/paddlespeech/t2s/exps/ernie_sat/utils.py @@ -11,35 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import hashlib -import os from pathlib import Path from typing import Dict from typing import List from typing import Union +import os import numpy as np import paddle import yaml from yacs.config import CfgNode +import hashlib + from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference - def _get_user(): return os.path.expanduser('~').split('/')[-1] - def str2md5(string): md5_val = hashlib.md5(string.encode('utf8')).hexdigest() return md5_val - -def get_tmp_name(text: str): +def get_tmp_name(text:str): return _get_user() + '_' + str(os.getpid()) + '_' + str2md5(text) - def get_dict(dictfile: str): word2phns_dict = {} with open(dictfile, 'r') as fid: diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 15d8dfb78..127e1a3ba 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -82,10 +82,6 @@ def denorm(data, mean, std): return data * std + mean -def norm(data, mean, std): - return (data - mean) / std - - def get_chunks(data, block_size: int, pad_size: int): data_len = data.shape[1] chunks = [] @@ -298,8 +294,8 @@ def am_to_static(am_inference, am_name = am[:am.rindex('_')] am_dataset = am[am.rindex('_') + 1:] if am_name == 'fastspeech2': - if am_dataset in {"aishell3", "vctk", - "mix"} and speaker_dict is not None: + if am_dataset in {"aishell3", "vctk", "mix" + } and speaker_dict is not None: am_inference = jit.to_static( am_inference, input_spec=[ @@ -311,8 +307,8 @@ def am_to_static(am_inference, am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) elif am_name == 'speedyspeech': - if am_dataset in {"aishell3", "vctk", - "mix"} and speaker_dict is not None: + if am_dataset in {"aishell3", "vctk", "mix" + } and speaker_dict is not None: am_inference = jit.to_static( am_inference, input_spec=[ diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py index 968684b25..074b890f9 100644 --- a/paddlespeech/t2s/exps/vits/synthesize.py +++ b/paddlespeech/t2s/exps/vits/synthesize.py @@ -15,7 +15,6 @@ import argparse from pathlib import Path import jsonlines -import numpy as np import paddle import soundfile as sf import yaml @@ -24,7 +23,6 @@ from yacs.config import CfgNode from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.vits import VITS -from paddlespeech.t2s.utils import str2bool def evaluate(args): @@ -42,26 +40,8 @@ def evaluate(args): print(config) fields = ["utt_id", "text"] - converters = {} - - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("Evaluating voice cloning!") - fields += ["spk_emb"] - else: - print("single speaker vits!") - print("spk_num:", spk_num) - test_dataset = DataTable( - data=test_metadata, - fields=fields, - converters=converters, ) + test_dataset = DataTable(data=test_metadata, fields=fields) with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] @@ -69,7 +49,6 @@ def evaluate(args): print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num vits = VITS(idim=vocab_size, odim=odim, **config["model"]) vits.set_state_dict(paddle.load(args.ckpt)["main_params"]) @@ -86,15 +65,7 @@ def evaluate(args): phone_ids = 
paddle.to_tensor(datum["text"]) with timer() as t: with paddle.no_grad(): - spk_emb = None - spk_id = None - # multi speaker - if args.voice_cloning and "spk_emb" in datum: - spk_emb = paddle.to_tensor(np.load(datum["spk_emb"])) - elif "spk_id" in datum: - spk_id = paddle.to_tensor(datum["spk_id"]) - out = vits.inference( - text=phone_ids, sids=spk_id, spembs=spk_emb) + out = vits.inference(text=phone_ids) wav = out["wav"] wav = wav.numpy() N += wav.size @@ -119,13 +90,6 @@ def parse_args(): '--ckpt', type=str, default=None, help='Checkpoint file of VITS.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - "--voice-cloning", - type=str2bool, - default=False, - help="whether training voice cloning model.") # other parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py index f9d10ea62..33a413751 100644 --- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py @@ -42,23 +42,12 @@ def evaluate(args): # frontend frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict) - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - else: - print("single speaker vits!") - print("spk_num:", spk_num) - with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num vits = VITS(idim=vocab_size, odim=odim, **config["model"]) vits.set_state_dict(paddle.load(args.ckpt)["main_params"]) @@ -89,10 +78,7 @@ def evaluate(args): flags = 0 for i in range(len(phone_ids)): part_phone_ids = phone_ids[i] - spk_id = None - if spk_num is not None: - spk_id = paddle.to_tensor(args.spk_id) - out = vits.inference(text=part_phone_ids, sids=spk_id) + out = vits.inference(text=part_phone_ids) wav = out["wav"] if flags == 0: wav_all = wav @@ -123,13 +109,6 @@ def parse_args(): '--ckpt', type=str, default=None, help='Checkpoint file of VITS.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - '--spk_id', - type=int, - default=0, - help='spk id for multi speaker acoustic model') # other parser.add_argument( '--lang', diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index c994faa5a..1a68d1326 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -28,7 +28,6 @@ from paddle.io import DistributedBatchSampler from paddle.optimizer import Adam from yacs.config import CfgNode -from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn from paddlespeech.t2s.datasets.data_table import DataTable from paddlespeech.t2s.models.vits import VITS @@ -44,7 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL from paddlespeech.t2s.training.optimizer import scheduler_classes from paddlespeech.t2s.training.seeding import 
seed_everything from paddlespeech.t2s.training.trainer import Trainer -from paddlespeech.t2s.utils import str2bool def train_sp(args, config): @@ -74,23 +72,6 @@ def train_sp(args, config): "wave": np.load, "feats": np.load, } - spk_num = None - if args.speaker_dict is not None: - print("multiple speaker vits!") - collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("Training voice cloning!") - collate_fn = vits_multi_spk_batch_fn - fields += ["spk_emb"] - converters["spk_emb"] = np.load - else: - print("single speaker vits!") - collate_fn = vits_single_spk_batch_fn - print("spk_num:", spk_num) # construct dataset for training and validation with jsonlines.open(args.train_metadata, 'r') as reader: @@ -119,16 +100,18 @@ def train_sp(args, config): drop_last=False) print("samplers done!") + train_batch_fn = vits_single_spk_batch_fn + train_dataloader = DataLoader( train_dataset, batch_sampler=train_sampler, - collate_fn=collate_fn, + collate_fn=train_batch_fn, num_workers=config.num_workers) dev_dataloader = DataLoader( dev_dataset, batch_sampler=dev_sampler, - collate_fn=collate_fn, + collate_fn=train_batch_fn, num_workers=config.num_workers) print("dataloaders done!") @@ -138,7 +121,6 @@ def train_sp(args, config): print("vocab_size:", vocab_size) odim = config.n_fft // 2 + 1 - config["model"]["generator_params"]["spks"] = spk_num model = VITS(idim=vocab_size, odim=odim, **config["model"]) gen_parameters = model.generator.parameters() dis_parameters = model.discriminator.parameters() @@ -258,17 +240,6 @@ def main(): "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") parser.add_argument( "--phones-dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file for multiple speaker model.") - - parser.add_argument( - "--voice-cloning", - type=str2bool, - default=False, - help="whether training voice cloning model.") args = parser.parse_args() diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index 80cfea4a6..b51a4d7bc 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,28 +21,13 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.cli.vector import VectorExecutor from paddlespeech.t2s.exps.syn_utils import get_am_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.utils import str2bool from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder -def gen_random_embed(use_ecapa: bool=False): - if use_ecapa: - # Randomly generate numbers of -25 ~ 25, 192 is the dim of spk_emb - random_spk_emb = (-1 + 2 * np.random.rand(192)) * 25 - - # GE2E - else: - # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb - random_spk_emb = np.random.rand(256) * 0.2 - random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32') - return random_spk_emb - - def voice_cloning(args): # Init body. 
with open(args.am_config) as f: @@ -56,47 +41,30 @@ def voice_cloning(args): print(am_config) print(voc_config) - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - - input_dir = Path(args.input_dir) - # speaker encoder - if args.use_ecapa: - vec_executor = VectorExecutor() - # warm up - vec_executor( - audio_file=input_dir / os.listdir(input_dir)[0], force_yes=True) - print("ECAPA-TDNN Done!") - # use GE2E - else: - p = SpeakerVerificationPreprocessor( - sampling_rate=16000, - audio_norm_target_dBFS=-30, - vad_window_length=30, - vad_moving_average_width=8, - vad_max_silence_length=6, - mel_window_length=25, - mel_window_step=10, - n_mels=40, - partial_n_frames=160, - min_pad_coverage=0.75, - partial_overlap_ratio=0.5) - print("Audio Processor Done!") - - speaker_encoder = LSTMSpeakerEncoder( - n_mels=40, num_layers=3, hidden_size=256, output_size=256) - speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) - speaker_encoder.eval() - print("GE2E Done!") + p = SpeakerVerificationPreprocessor( + sampling_rate=16000, + audio_norm_target_dBFS=-30, + vad_window_length=30, + vad_moving_average_width=8, + vad_max_silence_length=6, + mel_window_length=25, + mel_window_step=10, + n_mels=40, + partial_n_frames=160, + min_pad_coverage=0.75, + partial_overlap_ratio=0.5) + print("Audio Processor Done!") + + speaker_encoder = LSTMSpeakerEncoder( + n_mels=40, num_layers=3, hidden_size=256, output_size=256) + speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path)) + speaker_encoder.eval() + print("GE2E Done!") frontend = Frontend(phone_vocab_path=args.phones_dict) print("frontend done!") - sentence = args.text - input_ids = frontend.get_input_ids(sentence, merge_sentences=True) - phone_ids = input_ids["phone_ids"][0] - # acoustic model am_inference = get_am_inference( am=args.am, @@ -112,19 +80,26 @@ def voice_cloning(args): voc_ckpt=args.voc_ckpt, voc_stat=args.voc_stat) + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + + input_dir = Path(args.input_dir) + + sentence = args.text + + input_ids = frontend.get_input_ids(sentence, merge_sentences=True) + phone_ids = input_ids["phone_ids"][0] + for name in os.listdir(input_dir): utt_id = name.split(".")[0] ref_audio_path = input_dir / name - if args.use_ecapa: - spk_emb = vec_executor(audio_file=ref_audio_path, force_yes=True) - spk_emb = paddle.to_tensor(spk_emb) - # GE2E - else: - mel_sequences = p.extract_mel_partials( - p.preprocess_wav(ref_audio_path)) - with paddle.no_grad(): - spk_emb = speaker_encoder.embed_utterance( - paddle.to_tensor(mel_sequences)) + mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path)) + # print("mel_sequences: ", mel_sequences.shape) + with paddle.no_grad(): + spk_emb = speaker_encoder.embed_utterance( + paddle.to_tensor(mel_sequences)) + # print("spk_emb shape: ", spk_emb.shape) + with paddle.no_grad(): wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb)) @@ -133,17 +108,16 @@ def voice_cloning(args): wav.numpy(), samplerate=am_config.fs) print(f"{utt_id} done!") - - # generate 5 random_spk_emb - for i in range(5): - random_spk_emb = gen_random_embed(args.use_ecapa) - utt_id = "random_spk_emb" - with paddle.no_grad(): - wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb)) - sf.write( - str(output_dir / (utt_id + "_" + str(i) + ".wav")), - wav.numpy(), - samplerate=am_config.fs) + # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb + random_spk_emb = np.random.rand(256) * 0.2 + 
random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32') + utt_id = "random_spk_emb" + with paddle.no_grad(): + wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb)) + sf.write( + str(output_dir / (utt_id + ".wav")), + wav.numpy(), + samplerate=am_config.fs) print(f"{utt_id} done!") @@ -197,15 +171,13 @@ def parse_args(): type=str, default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。", help="text to synthesize, a line") + parser.add_argument( "--ge2e_params_path", type=str, help="ge2e params path.") - parser.add_argument( - "--use_ecapa", - type=str2bool, - default=False, - help="whether to use ECAPA-TDNN as speaker encoder.") + parser.add_argument( "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.") + parser.add_argument( "--input-dir", type=str, diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py index 0eaeee5df..6e1ee0db8 100644 --- a/paddlespeech/t2s/frontend/g2pw/__init__.py +++ b/paddlespeech/t2s/frontend/g2pw/__init__.py @@ -1 +1,2 @@ from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter + diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py index 98af5f463..ab715dc36 100644 --- a/paddlespeech/t2s/frontend/g2pw/dataset.py +++ b/paddlespeech/t2s/frontend/g2pw/dataset.py @@ -81,12 +81,12 @@ def prepare_onnx_input(tokenizer, position_ids.append(position_id) outputs = { - 'input_ids': np.array(input_ids).astype(np.int64), - 'token_type_ids': np.array(token_type_ids).astype(np.int64), - 'attention_masks': np.array(attention_masks).astype(np.int64), + 'input_ids': np.array(input_ids), + 'token_type_ids': np.array(token_type_ids), + 'attention_masks': np.array(attention_masks), 'phoneme_masks': np.array(phoneme_masks).astype(np.float32), - 'char_ids': np.array(char_ids).astype(np.int64), - 'position_ids': np.array(position_ids).astype(np.int64), + 'char_ids': np.array(char_ids), + 'position_ids': np.array(position_ids), } return outputs diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 180e8ae15..9e708ec88 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -34,7 +34,7 @@ from paddlespeech.t2s.frontend.g2pw.utils import load_config from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified from paddlespeech.utils.env import MODEL_HOME -model_version = '1.1' +model_version = '1.0' def predict(session, onnx_input, labels): diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 101a1e503..6868d3357 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -61,11 +61,7 @@ class MixFrontend(): return False def is_end(self, before_char, after_char) -> bool: - flag = 0 - for char in (before_char, after_char): - if self.is_alphabet(char) or char == " ": - flag += 1 - if flag == 2: + if ((self.is_alphabet(before_char) or before_char == " ") and (self.is_alphabet(after_char) or after_char == " ")): return True else: return False @@ -90,11 +86,10 @@ class MixFrontend(): if point_index == 0 or point_index == len(text) - 1: new_text = text else: - if not self.is_end(text[point_index - 1], text[point_index + - 1]): + if not self.is_end(text[point_index - 1], text[point_index + 1]): new_text = text else: - new_text = text[:point_index] + "。" + text[point_index + 1:] + new_text = text[: point_index] + "。" + 
text[point_index + 1:] elif len(point_indexs) == 2: first_index = point_indexs[0] @@ -102,8 +97,7 @@ class MixFrontend(): # first if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): + if not self.is_end(text[first_index - 1], text[first_index + 1]): new_text += (text[:first_index] + ".") else: new_text += (text[:first_index] + "。") @@ -112,20 +106,18 @@ class MixFrontend(): # last if end_index != len(text) - 1: if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] + new_text += text[point_indexs[-2] + 1 : ] else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) + new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ]) else: - new_text += "." + new_text += "." else: first_index = point_indexs[0] end_index = point_indexs[-1] # first if first_index != 0: - if not self.is_end(text[first_index - 1], text[first_index + - 1]): + if not self.is_end(text[first_index - 1], text[first_index + 1]): new_text += (text[:first_index] + ".") else: new_text += (text[:first_index] + "。") @@ -134,20 +126,16 @@ class MixFrontend(): # middle for j in range(1, len(point_indexs) - 1): point_index = point_indexs[j] - if not self.is_end(text[point_index - 1], text[point_index + - 1]): - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + ".") + if not self.is_end(text[point_index - 1], text[point_index + 1]): + new_text += (text[point_indexs[j-1] + 1 : point_index] + ".") else: - new_text += ( - text[point_indexs[j - 1] + 1:point_index] + "。") + new_text += (text[point_indexs[j-1] + 1 : point_index] + "。") # last if end_index != len(text) - 1: if not self.is_end(text[end_index - 1], text[end_index + 1]): - new_text += text[point_indexs[-2] + 1:] + new_text += text[point_indexs[-2] + 1 : ] else: - new_text += (text[point_indexs[-2] + 1:end_index] + "。" + - text[end_index + 1:]) + new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ]) else: new_text += "." 
@@ -236,7 +224,7 @@ class MixFrontend(): def get_input_ids(self, sentence: str, - merge_sentences: bool=False, + merge_sentences: bool=True, get_tone_ids: bool=False, add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: @@ -244,29 +232,28 @@ class MixFrontend(): sentences = self._split(sentence) phones_list = [] result = {} + for text in sentences: phones_seg = [] segments = self._distinguish(text) for seg in segments: content = seg[0] lang = seg[1] - if content != '': - if lang == "en": - input_ids = self.en_frontend.get_input_ids( - content, merge_sentences=True, to_tensor=to_tensor) - else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=True, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) - - phones_seg.append(input_ids["phone_ids"][0]) - if add_sp: - phones_seg.append(self.sp_id_tensor) - - if phones_seg == []: - phones_seg.append(self.sp_id_tensor) + if lang == "zh": + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=True, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + + elif lang == "en": + input_ids = self.en_frontend.get_input_ids( + content, merge_sentences=True, to_tensor=to_tensor) + + phones_seg.append(input_ids["phone_ids"][0]) + if add_sp: + phones_seg.append(self.sp_id_tensor) + phones = paddle.concat(phones_seg) phones_list.append(phones) diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml index 51b76f23f..2c7cf33fb 100644 --- a/paddlespeech/t2s/frontend/polyphonic.yaml +++ b/paddlespeech/t2s/frontend/polyphonic.yaml @@ -42,8 +42,3 @@ polyphonic: 咖喱: ['ga1','li5'] 时分: ['shi2','fen1'] 蚌埠: ['beng4','bu4'] - 驯服: ['xun4','fu2'] - 幸免于难: ['xing4','mian3','yu2','nan4'] - 恶行: ['e4','xing2'] - 唉: ['ai4'] - diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py index 9fff4272c..e5ef617a9 100644 --- a/paddlespeech/t2s/frontend/tone_sandhi.py +++ b/paddlespeech/t2s/frontend/tone_sandhi.py @@ -42,7 +42,7 @@ class ToneSandhi(): '木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾', '收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼', '抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担', - '戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人', + '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事', '帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股', '尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在', @@ -60,7 +60,7 @@ class ToneSandhi(): '邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划', '扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜', '糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄', - '将军' + '将军', '别人' } self.must_not_neural_tone_words = { '男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎', @@ -84,7 +84,7 @@ class ToneSandhi(): if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}: finals[j] = finals[j][:-1] + "5" ge_idx = word.find("个") - if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶": + if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶": finals[-1] = finals[-1][:-1] + "5" elif len(word) >= 1 and word[-1] in "的地得": finals[-1] = finals[-1][:-1] + "5" @@ -169,7 +169,6 @@ class ToneSandhi(): return new_word_list def _three_sandhi(self, word: str, finals: List[str]) -> List[str]: - if len(word) == 2 and self._all_tone_three(finals): finals[0] = finals[0][:-1] + "2" elif 
len(word) == 3: @@ -347,7 +346,6 @@ class ToneSandhi(): def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]: - finals = self._bu_sandhi(word, finals) finals = self._yi_sandhi(word, finals) finals = self._neural_sandhi(word, pos, finals) diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py index 8a54d3e63..ec1367736 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/num.py +++ b/paddlespeech/t2s/frontend/zh_normalization/num.py @@ -28,7 +28,7 @@ UNITS = OrderedDict({ 8: '亿', }) -COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)' +COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)' # 分数表达式 RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)') diff --git a/paddlespeech/t2s/models/ernie_sat/__init__.py b/paddlespeech/t2s/models/ernie_sat/__init__.py index 87e7afe85..7e795370e 100644 --- a/paddlespeech/t2s/models/ernie_sat/__init__.py +++ b/paddlespeech/t2s/models/ernie_sat/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from .ernie_sat import * from .ernie_sat_updater import * +from .mlm import * diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py index 08c43dc5f..54f5d542d 100644 --- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py +++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py @@ -389,7 +389,7 @@ class MLM(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]: + use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]: ''' Args: speech (paddle.Tensor): input speech (1, Tmax, D). @@ -657,7 +657,7 @@ class ErnieSAT(nn.Layer): speech_seg_pos: paddle.Tensor, text_seg_pos: paddle.Tensor, span_bdy: List[int], - use_teacher_forcing: bool=True, ) -> Dict[str, paddle.Tensor]: + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: return self.model.inference( speech=speech, text=text, diff --git a/paddlespeech/t2s/models/ernie_sat/mlm.py b/paddlespeech/t2s/models/ernie_sat/mlm.py new file mode 100644 index 000000000..647fdd9b4 --- /dev/null +++ b/paddlespeech/t2s/models/ernie_sat/mlm.py @@ -0,0 +1,579 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +from typing import Dict +from typing import List +from typing import Optional + +import paddle +import yaml +from paddle import nn +from yacs.config import CfgNode + +from paddlespeech.t2s.modules.activation import get_activation +from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule +from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer +from paddlespeech.t2s.modules.layer_norm import LayerNorm +from paddlespeech.t2s.modules.masked_fill import masked_fill +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention +from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention +from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding +from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding +from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear +from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d +from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward +from paddlespeech.t2s.modules.transformer.repeat import repeat +from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling + + +# MLM -> Mask Language Model +class mySequential(nn.Sequential): + def forward(self, *inputs): + for module in self._sub_layers.values(): + if type(inputs) == tuple: + inputs = module(*inputs) + else: + inputs = module(inputs) + return inputs + + +class MaskInputLayer(nn.Layer): + def __init__(self, out_features: int) -> None: + super().__init__() + self.mask_feature = paddle.create_parameter( + shape=(1, 1, out_features), + dtype=paddle.float32, + default_initializer=paddle.nn.initializer.Assign( + paddle.normal(shape=(1, 1, out_features)))) + + def forward(self, input: paddle.Tensor, + masked_pos: paddle.Tensor=None) -> paddle.Tensor: + masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input) + masked_input = masked_fill(input, masked_pos, 0) + masked_fill( + paddle.expand_as(self.mask_feature, input), ~masked_pos, 0) + return masked_input + + +class MLMEncoder(nn.Layer): + """Conformer encoder module. + + Args: + idim (int): Input dimension. + attention_dim (int): Dimension of attention. + attention_heads (int): The number of heads of multi head attention. + linear_units (int): The number of units of position-wise feed forward. + num_blocks (int): The number of decoder blocks. + dropout_rate (float): Dropout rate. + positional_dropout_rate (float): Dropout rate after adding positional encoding. + attention_dropout_rate (float): Dropout rate in attention. + input_layer (Union[str, paddle.nn.Layer]): Input layer type. + normalize_before (bool): Whether to use layer_norm before the first block. + concat_after (bool): Whether to concat attention layer's input and output. + if True, additional linear will be applied. + i.e. x -> x + linear(concat(x, att(x))) + if False, no additional linear will be applied. i.e. 
x -> x + att(x) + positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear". + positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer. + macaron_style (bool): Whether to use macaron style for positionwise layer. + pos_enc_layer_type (str): Encoder positional encoding layer type. + selfattention_layer_type (str): Encoder attention layer type. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + zero_triu (bool): Whether to zero the upper triangular part of attention matrix. + cnn_module_kernel (int): Kernerl size of convolution module. + padding_idx (int): Padding idx for input_layer=embed. + stochastic_depth_rate (float): Maximum probability to skip the encoder layer. + + """ + + def __init__(self, + idim: int, + vocab_size: int=0, + pre_speech_layer: int=0, + attention_dim: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + normalize_before: bool=True, + concat_after: bool=False, + positionwise_layer_type: str="linear", + positionwise_conv_kernel_size: int=1, + macaron_style: bool=False, + pos_enc_layer_type: str="abs_pos", + selfattention_layer_type: str="selfattn", + activation_type: str="swish", + use_cnn_module: bool=False, + zero_triu: bool=False, + cnn_module_kernel: int=31, + padding_idx: int=-1, + stochastic_depth_rate: float=0.0, + text_masking: bool=False): + """Construct an Encoder object.""" + super().__init__() + self._output_size = attention_dim + self.text_masking = text_masking + if self.text_masking: + self.text_masking_layer = MaskInputLayer(attention_dim) + activation = get_activation(activation_type) + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "scaled_abs_pos": + pos_enc_class = ScaledPositionalEncoding + elif pos_enc_layer_type == "rel_pos": + assert selfattention_layer_type == "rel_selfattn" + pos_enc_class = RelPositionalEncoding + elif pos_enc_layer_type == "legacy_rel_pos": + pos_enc_class = LegacyRelPositionalEncoding + assert selfattention_layer_type == "legacy_rel_selfattn" + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + self.conv_subsampling_factor = 1 + if input_layer == "linear": + self.embed = nn.Sequential( + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.Dropout(dropout_rate), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "conv2d": + self.embed = Conv2dSubsampling( + idim, + attention_dim, + dropout_rate, + pos_enc_class(attention_dim, positional_dropout_rate), ) + self.conv_subsampling_factor = 4 + elif input_layer == "embed": + self.embed = nn.Sequential( + nn.Embedding(idim, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "mlm": + self.segment_emb = None + self.speech_embed = mySequential( + MaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate)) + self.text_embed = nn.Sequential( + nn.Embedding( + vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer == "sega_mlm": + self.segment_emb = nn.Embedding( + 500, attention_dim, padding_idx=padding_idx) + self.speech_embed = mySequential( + 
MaskInputLayer(idim), + nn.Linear(idim, attention_dim), + nn.LayerNorm(attention_dim), + nn.ReLU(), + pos_enc_class(attention_dim, positional_dropout_rate)) + self.text_embed = nn.Sequential( + nn.Embedding( + vocab_size, attention_dim, padding_idx=padding_idx), + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif isinstance(input_layer, nn.Layer): + self.embed = nn.Sequential( + input_layer, + pos_enc_class(attention_dim, positional_dropout_rate), ) + elif input_layer is None: + self.embed = nn.Sequential( + pos_enc_class(attention_dim, positional_dropout_rate)) + else: + raise ValueError("unknown input_layer: " + input_layer) + self.normalize_before = normalize_before + + # self-attention module definition + if selfattention_layer_type == "selfattn": + encoder_selfattn_layer = MultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "legacy_rel_selfattn": + assert pos_enc_layer_type == "legacy_rel_pos" + encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, ) + elif selfattention_layer_type == "rel_selfattn": + assert pos_enc_layer_type == "rel_pos" + encoder_selfattn_layer = RelPositionMultiHeadedAttention + encoder_selfattn_layer_args = (attention_heads, attention_dim, + attention_dropout_rate, zero_triu, ) + else: + raise ValueError("unknown encoder_attn_layer: " + + selfattention_layer_type) + + # feed-forward module definition + if positionwise_layer_type == "linear": + positionwise_layer = PositionwiseFeedForward + positionwise_layer_args = (attention_dim, linear_units, + dropout_rate, activation, ) + elif positionwise_layer_type == "conv1d": + positionwise_layer = MultiLayeredConv1d + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + elif positionwise_layer_type == "conv1d-linear": + positionwise_layer = Conv1dLinear + positionwise_layer_args = (attention_dim, linear_units, + positionwise_conv_kernel_size, + dropout_rate, ) + else: + raise NotImplementedError("Support only linear or conv1d.") + + # convolution module definition + convolution_layer = ConvolutionModule + convolution_layer_args = (attention_dim, cnn_module_kernel, activation) + + self.encoders = repeat( + num_blocks, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / num_blocks, ), ) + self.pre_speech_layer = pre_speech_layer + self.pre_speech_encoders = repeat( + self.pre_speech_layer, + lambda lnum: EncoderLayer( + attention_dim, + encoder_selfattn_layer(*encoder_selfattn_layer_args), + positionwise_layer(*positionwise_layer_args), + positionwise_layer(*positionwise_layer_args) if macaron_style else None, + convolution_layer(*convolution_layer_args) if use_cnn_module else None, + dropout_rate, + normalize_before, + concat_after, + stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ), + ) + if self.normalize_before: + self.after_norm = LayerNorm(attention_dim) + + def forward(self, + speech: paddle.Tensor, + text: paddle.Tensor, + masked_pos: paddle.Tensor, + speech_mask: paddle.Tensor=None, + text_mask: 
paddle.Tensor=None, + speech_seg_pos: paddle.Tensor=None, + text_seg_pos: paddle.Tensor=None): + """Encode input sequence. + + """ + if masked_pos is not None: + speech = self.speech_embed(speech, masked_pos) + else: + speech = self.speech_embed(speech) + if text is not None: + text = self.text_embed(text) + if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb: + speech_seg_emb = self.segment_emb(speech_seg_pos) + text_seg_emb = self.segment_emb(text_seg_pos) + text = (text[0] + text_seg_emb, text[1]) + speech = (speech[0] + speech_seg_emb, speech[1]) + if self.pre_speech_encoders: + speech, _ = self.pre_speech_encoders(speech, speech_mask) + + if text is not None: + xs = paddle.concat([speech[0], text[0]], axis=1) + xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1) + masks = paddle.concat([speech_mask, text_mask], axis=-1) + else: + xs = speech[0] + xs_pos_emb = speech[1] + masks = speech_mask + + xs, masks = self.encoders((xs, xs_pos_emb), masks) + + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + return xs, masks + + +class MLMDecoder(MLMEncoder): + def forward(self, xs: paddle.Tensor, masks: paddle.Tensor): + """Encode input sequence. + + Args: + xs (paddle.Tensor): Input tensor (#batch, time, idim). + masks (paddle.Tensor): Mask tensor (#batch, time). + + Returns: + paddle.Tensor: Output tensor (#batch, time, attention_dim). + paddle.Tensor: Mask tensor (#batch, time). + + """ + xs = self.embed(xs) + xs, masks = self.encoders(xs, masks) + + if isinstance(xs, tuple): + xs = xs[0] + if self.normalize_before: + xs = self.after_norm(xs) + + return xs, masks + + +# encoder and decoder is nn.Layer, not str +class MLM(nn.Layer): + def __init__(self, + odim: int, + encoder: nn.Layer, + decoder: Optional[nn.Layer], + postnet_layers: int=0, + postnet_chans: int=0, + postnet_filts: int=0, + text_masking: bool=False): + + super().__init__() + self.odim = odim + self.encoder = encoder + self.decoder = decoder + self.vocab_size = encoder.text_embed[0]._num_embeddings + + if self.decoder is None or not (hasattr(self.decoder, + 'output_layer') and + self.decoder.output_layer is not None): + self.sfc = nn.Linear(self.encoder._output_size, odim) + else: + self.sfc = None + if text_masking: + self.text_sfc = nn.Linear( + self.encoder.text_embed[0]._embedding_dim, + self.vocab_size, + weight_attr=self.encoder.text_embed[0]._weight_attr) + else: + self.text_sfc = None + + self.postnet = (None if postnet_layers == 0 else Postnet( + idim=self.encoder._output_size, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=True, + dropout_rate=0.5, )) + + def inference( + self, + speech: paddle.Tensor, + text: paddle.Tensor, + masked_pos: paddle.Tensor, + speech_mask: paddle.Tensor, + text_mask: paddle.Tensor, + speech_seg_pos: paddle.Tensor, + text_seg_pos: paddle.Tensor, + span_bdy: List[int], + use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]: + ''' + Args: + speech (paddle.Tensor): input speech (1, Tmax, D). + text (paddle.Tensor): input text (1, Tmax2). + masked_pos (paddle.Tensor): masked position of input speech (1, Tmax) + speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax). + text_mask (paddle.Tensor): mask of text (1, 1, Tmax2). + speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax). + text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2). 
+            span_bdy (List[int]): masked mel boundary of input speech (2,)
+            use_teacher_forcing (bool): whether to use teacher forcing
+        Returns:
+            List[Tensor]:
+                eg:
+                [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
+        '''
+
+        z_cache = None
+        if use_teacher_forcing:
+            before_outs, zs, *_ = self.forward(
+                speech=speech,
+                text=text,
+                masked_pos=masked_pos,
+                speech_mask=speech_mask,
+                text_mask=text_mask,
+                speech_seg_pos=speech_seg_pos,
+                text_seg_pos=text_seg_pos)
+            if zs is None:
+                zs = before_outs
+
+            speech = speech.squeeze(0)
+            outs = [speech[:span_bdy[0]]]
+            outs += [zs[0][span_bdy[0]:span_bdy[1]]]
+            outs += [speech[span_bdy[1]:]]
+            return outs
+        return None
+
+
+class MLMEncAsDecoder(MLM):
+    def forward(self,
+                speech: paddle.Tensor,
+                text: paddle.Tensor,
+                masked_pos: paddle.Tensor,
+                speech_mask: paddle.Tensor,
+                text_mask: paddle.Tensor,
+                speech_seg_pos: paddle.Tensor,
+                text_seg_pos: paddle.Tensor):
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        encoder_out, h_masks = self.encoder(
+            speech=speech,
+            text=text,
+            masked_pos=masked_pos,
+            speech_mask=speech_mask,
+            text_mask=text_mask,
+            speech_seg_pos=speech_seg_pos,
+            text_seg_pos=text_seg_pos)
+        if self.decoder is not None:
+            zs, _ = self.decoder(encoder_out, h_masks)
+        else:
+            zs = encoder_out
+        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+        if self.sfc is not None:
+            before_outs = paddle.reshape(
+                self.sfc(speech_hidden_states),
+                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+        else:
+            before_outs = speech_hidden_states
+        if self.postnet is not None:
+            after_outs = before_outs + paddle.transpose(
+                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+                [0, 2, 1])
+        else:
+            after_outs = None
+        return before_outs, after_outs, None
+
+
+class MLMDualMaksing(MLM):
+    def forward(self,
+                speech: paddle.Tensor,
+                text: paddle.Tensor,
+                masked_pos: paddle.Tensor,
+                speech_mask: paddle.Tensor,
+                text_mask: paddle.Tensor,
+                speech_seg_pos: paddle.Tensor,
+                text_seg_pos: paddle.Tensor):
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        encoder_out, h_masks = self.encoder(
+            speech=speech,
+            text=text,
+            masked_pos=masked_pos,
+            speech_mask=speech_mask,
+            text_mask=text_mask,
+            speech_seg_pos=speech_seg_pos,
+            text_seg_pos=text_seg_pos)
+        if self.decoder is not None:
+            zs, _ = self.decoder(encoder_out, h_masks)
+        else:
+            zs = encoder_out
+        speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+        if self.text_sfc:
+            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
+            text_outs = paddle.reshape(
+                self.text_sfc(text_hidden_states),
+                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
+        if self.sfc is not None:
+            before_outs = paddle.reshape(
+                self.sfc(speech_hidden_states),
+                (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+        else:
+            before_outs = speech_hidden_states
+        if self.postnet is not None:
+            after_outs = before_outs + paddle.transpose(
+                self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+                [0, 2, 1])
+        else:
+            after_outs = None
+        return before_outs, after_outs, text_outs
+
+
+def build_model_from_file(config_file, model_file):
+
+    state_dict = paddle.load(model_file)
+    model_class = MLMDualMaksing if 'conformer_combine_vctk_aishell3_dual_masking' in config_file \
+        else MLMEncAsDecoder
+
+    # build the model
+    with open(config_file) as f:
+        conf = CfgNode(yaml.safe_load(f))
+    model = build_model(conf, model_class)
+    model.set_state_dict(state_dict)
+    return model, conf
+
+
+# select encoder and
decoder here +def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM: + if isinstance(args.token_list, str): + with open(args.token_list, encoding="utf-8") as f: + token_list = [line.rstrip() for line in f] + + # Overwriting token_list to keep it as "portable". + args.token_list = list(token_list) + elif isinstance(args.token_list, (tuple, list)): + token_list = list(args.token_list) + else: + raise RuntimeError("token_list must be str or list") + + vocab_size = len(token_list) + odim = 80 + + # Encoder + encoder_class = MLMEncoder + + if 'text_masking' in args.model_conf.keys() and args.model_conf[ + 'text_masking']: + args.encoder_conf['text_masking'] = True + else: + args.encoder_conf['text_masking'] = False + + encoder = encoder_class( + args.input_size, vocab_size=vocab_size, **args.encoder_conf) + + # Decoder + if args.decoder != 'no_decoder': + decoder_class = MLMDecoder + decoder = decoder_class( + idim=0, + input_layer=None, + **args.decoder_conf, ) + else: + decoder = None + + # Build model + model = model_class( + odim=odim, + encoder=encoder, + decoder=decoder, + **args.model_conf, ) + + # Initialize + if args.init is not None: + initialize(model, args.init) + + return model diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py index 359b66258..f87de91a2 100644 --- a/paddlespeech/t2s/models/vits/generator.py +++ b/paddlespeech/t2s/models/vits/generator.py @@ -522,82 +522,6 @@ class VITSGenerator(nn.Layer): return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1) - def voice_conversion( - self, - feats: paddle.Tensor=None, - feats_lengths: paddle.Tensor=None, - sids_src: Optional[paddle.Tensor]=None, - sids_tgt: Optional[paddle.Tensor]=None, - spembs_src: Optional[paddle.Tensor]=None, - spembs_tgt: Optional[paddle.Tensor]=None, - lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - """Run voice conversion. - Args: - feats (Tensor): Feature tensor (B, aux_channels, T_feats,). - feats_lengths (Tensor): Feature length tensor (B,). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim). - lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). - Returns: - Tensor: Generated waveform tensor (B, T_wav). 
- """ - # encoder - g_src = None - g_tgt = None - if self.spks is not None: - # (B, global_channels, 1) - g_src = self.global_emb( - paddle.reshape(sids_src, [-1])).unsqueeze(-1) - g_tgt = self.global_emb( - paddle.reshape(sids_tgt, [-1])).unsqueeze(-1) - - if self.spk_embed_dim is not None: - # (B, global_channels, 1) - g_src_ = self.spemb_proj( - F.normalize(spembs_src.unsqueeze(0))).unsqueeze(-1) - if g_src is None: - g_src = g_src_ - else: - g_src = g_src + g_src_ - - # (B, global_channels, 1) - g_tgt_ = self.spemb_proj( - F.normalize(spembs_tgt.unsqueeze(0))).unsqueeze(-1) - if g_tgt is None: - g_tgt = g_tgt_ - else: - g_tgt = g_tgt + g_tgt_ - - if self.langs is not None: - # (B, global_channels, 1) - g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1) - - if g_src is None: - g_src = g_ - else: - g_src = g_src + g_ - - if g_tgt is None: - g_tgt = g_ - else: - g_tgt = g_tgt + g_ - - # forward posterior encoder - z, m_q, logs_q, y_mask = self.posterior_encoder( - feats, feats_lengths, g=g_src) - - # forward flow - # (B, H, T_feats) - z_p = self.flow(z, y_mask, g=g_src) - - # decoder - z_hat = self.flow(z_p, y_mask, g=g_tgt, inverse=True) - wav = self.decoder(z_hat * y_mask, g=g_tgt) - - return wav.squeeze(1) - def _generate_path(self, dur: paddle.Tensor, mask: paddle.Tensor) -> paddle.Tensor: """Generate path a.k.a. monotonic attention. diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 983bf0a36..5c476be77 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -381,7 +381,7 @@ class VITS(nn.Layer): if use_teacher_forcing: assert feats is not None feats = feats[None].transpose([0, 2, 1]) - feats_lengths = paddle.to_tensor(paddle.shape(feats)[2]) + feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]]) wav, att_w, dur = self.generator.inference( text=text, text_lengths=text_lengths, @@ -406,43 +406,3 @@ class VITS(nn.Layer): max_len=max_len, ) return dict( wav=paddle.reshape(wav, [-1]), att_w=att_w[0], duration=dur[0]) - - def voice_conversion( - self, - feats: paddle.Tensor, - sids_src: Optional[paddle.Tensor]=None, - sids_tgt: Optional[paddle.Tensor]=None, - spembs_src: Optional[paddle.Tensor]=None, - spembs_tgt: Optional[paddle.Tensor]=None, - lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor: - """Run voice conversion. - Args: - feats (Tensor): Feature tensor (T_feats, aux_channels). - sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,). - sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,). - spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,). - spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,). - lids (Optional[Tensor]): Language index tensor (1,). - Returns: - Dict[str, Tensor]: - * wav (Tensor): Generated waveform tensor (T_wav,). 
- """ - assert feats is not None - feats = feats[None].transpose([0, 2, 1]) - feats_lengths = paddle.to_tensor(paddle.shape(feats)[2]) - - sids_none = sids_src is None and sids_tgt is None - spembs_none = spembs_src is None and spembs_tgt is None - - assert not sids_none or not spembs_none - - wav = self.generator.voice_conversion( - feats, - feats_lengths, - sids_src, - sids_tgt, - spembs_src, - spembs_tgt, - lids, ) - - return dict(wav=paddle.reshape(wav, [-1])) diff --git a/paddlespeech/t2s/models/vits/vits_updater.py b/paddlespeech/t2s/models/vits/vits_updater.py index 9f8be6803..76271fd97 100644 --- a/paddlespeech/t2s/models/vits/vits_updater.py +++ b/paddlespeech/t2s/models/vits/vits_updater.py @@ -111,8 +111,6 @@ class VITSUpdater(StandardUpdater): text_lengths=batch["text_lengths"], feats=batch["feats"], feats_lengths=batch["feats_lengths"], - sids=batch.get("spk_id", None), - spembs=batch.get("spk_emb", None), forward_generator=turn == "generator") # Generator if turn == "generator": @@ -270,8 +268,6 @@ class VITSEvaluator(StandardEvaluator): text_lengths=batch["text_lengths"], feats=batch["feats"], feats_lengths=batch["feats_lengths"], - sids=batch.get("spk_id", None), - spembs=batch.get("spk_emb", None), forward_generator=turn == "generator") # Generator if turn == "generator": diff --git a/paddlespeech/t2s/training/updaters/standard_updater.py b/paddlespeech/t2s/training/updaters/standard_updater.py index 6d3aa7099..668d2fc69 100644 --- a/paddlespeech/t2s/training/updaters/standard_updater.py +++ b/paddlespeech/t2s/training/updaters/standard_updater.py @@ -24,11 +24,10 @@ from paddle.nn import Layer from paddle.optimizer import Optimizer from timer import timer -from paddlespeech.t2s.datasets.sampler import ErnieSATSampler from paddlespeech.t2s.training.reporter import report from paddlespeech.t2s.training.updater import UpdaterBase from paddlespeech.t2s.training.updater import UpdaterState - +from paddlespeech.t2s.datasets.sampler import ErnieSATSampler class StandardUpdater(UpdaterBase): """An example of over-simplification. Things may not be that simple, but