diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index b781c4a8e..d52b0dca7 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -14,3 +14,9 @@
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+
+
+
+
+
diff --git a/paddlespeech/audio/__init__.py b/paddlespeech/audio/__init__.py
index a91958105..83be8e32e 100644
--- a/paddlespeech/audio/__init__.py
+++ b/paddlespeech/audio/__init__.py
@@ -14,12 +14,12 @@
from . import compliance
from . import datasets
from . import features
+from . import text
+from . import transform
+from . import streamdata
from . import functional
from . import io
from . import metric
from . import sox_effects
-from . import streamdata
-from . import text
-from . import transform
from .backends import load
from .backends import save
diff --git a/paddlespeech/audio/streamdata/__init__.py b/paddlespeech/audio/streamdata/__init__.py
index 47a2e79b3..753fcc11b 100644
--- a/paddlespeech/audio/streamdata/__init__.py
+++ b/paddlespeech/audio/streamdata/__init__.py
@@ -4,66 +4,67 @@
# Modified from https://github.com/webdataset/webdataset
#
# flake8: noqa
-from .cache import cached_tarfile_samples
-from .cache import cached_tarfile_to_samples
-from .cache import lru_cleanup
-from .cache import pipe_cleaner
-from .compat import FluidWrapper
-from .compat import WebDataset
-from .compat import WebLoader
-from .extradatasets import MockDataset
-from .extradatasets import with_epoch
-from .extradatasets import with_length
-from .filters import associate
-from .filters import audio_cmvn
-from .filters import audio_compute_fbank
-from .filters import audio_data_filter
-from .filters import audio_padding
-from .filters import audio_resample
-from .filters import audio_spec_aug
-from .filters import audio_tokenize
-from .filters import batched
-from .filters import decode
-from .filters import detshuffle
-from .filters import extract_keys
-from .filters import getfirst
-from .filters import info
-from .filters import map
-from .filters import map_dict
-from .filters import map_tuple
-from .filters import pipelinefilter
-from .filters import placeholder
-from .filters import rename
-from .filters import rename_keys
-from .filters import select
-from .filters import shuffle
-from .filters import slice
-from .filters import sort
-from .filters import to_tuple
-from .filters import transform_with
-from .filters import unbatched
-from .filters import xdecode
-from .handlers import ignore_and_continue
-from .handlers import ignore_and_stop
-from .handlers import reraise_exception
-from .handlers import warn_and_continue
-from .handlers import warn_and_stop
-from .mix import RandomMix
-from .mix import RoundRobin
+
+from .cache import (
+ cached_tarfile_samples,
+ cached_tarfile_to_samples,
+ lru_cleanup,
+ pipe_cleaner,
+)
+from .compat import WebDataset, WebLoader, FluidWrapper
+from .extradatasets import MockDataset, with_epoch, with_length
+from .filters import (
+ associate,
+ batched,
+ decode,
+ detshuffle,
+ extract_keys,
+ getfirst,
+ info,
+ map,
+ map_dict,
+ map_tuple,
+ pipelinefilter,
+ rename,
+ rename_keys,
+ audio_resample,
+ select,
+ shuffle,
+ slice,
+ to_tuple,
+ transform_with,
+ unbatched,
+ xdecode,
+ audio_data_filter,
+ audio_tokenize,
+ audio_compute_fbank,
+ audio_spec_aug,
+ sort,
+ audio_padding,
+ audio_cmvn,
+ placeholder,
+)
+from .handlers import (
+ ignore_and_continue,
+ ignore_and_stop,
+ reraise_exception,
+ warn_and_continue,
+ warn_and_stop,
+)
from .pipeline import DataPipeline
-from .shardlists import MultiShardSample
-from .shardlists import non_empty
-from .shardlists import resampled
-from .shardlists import ResampledShards
-from .shardlists import shardspec
-from .shardlists import SimpleShardList
-from .shardlists import single_node_only
-from .shardlists import split_by_node
-from .shardlists import split_by_worker
-from .tariterators import tarfile_samples
-from .tariterators import tarfile_to_samples
-from .utils import PipelineStage
-from .utils import repeatedly
-from .writer import numpy_dumps
-from .writer import ShardWriter
-from .writer import TarWriter
+from .shardlists import (
+ MultiShardSample,
+ ResampledShards,
+ SimpleShardList,
+ non_empty,
+ resampled,
+ shardspec,
+ single_node_only,
+ split_by_node,
+ split_by_worker,
+)
+from .tariterators import tarfile_samples, tarfile_to_samples
+from .utils import PipelineStage, repeatedly
+from .writer import ShardWriter, TarWriter, numpy_dumps
+from .mix import RandomMix, RoundRobin
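For context, the names regrouped above are the package-level surface of the streamdata module. A minimal sketch, not part of this patch, of how a few of them compose into a shard-listing pipeline; the shard pattern is a placeholder:

import paddlespeech.audio.streamdata as wds

# Each stage is either an IterableDataset or a callable applied to the previous stage.
dataset = wds.DataPipeline(
    wds.SimpleShardList("shards/train-{000000..000009}.tar"),  # placeholder shard URLs
    wds.split_by_worker,   # each DataLoader worker sees a disjoint subset of shards
    wds.shuffle(100),      # shuffle the shard list with a small in-memory buffer
)

for shard in dataset:
    print(shard["url"])    # SimpleShardList yields dict(url=...) entries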
diff --git a/paddlespeech/audio/streamdata/autodecode.py b/paddlespeech/audio/streamdata/autodecode.py
index d7f7937bd..ca0e2ea2f 100644
--- a/paddlespeech/audio/streamdata/autodecode.py
+++ b/paddlespeech/audio/streamdata/autodecode.py
@@ -5,19 +5,18 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
+
"""Automatically decode webdataset samples."""
-import io
-import json
-import os
-import pickle
-import re
-import tempfile
+
+import io, json, os, pickle, re, tempfile
from functools import partial
import numpy as np
+
"""Extensions passed on to the image decoder."""
image_extensions = "jpg jpeg png ppm pgm pbm pnm".split()
+
################################################################
# handle basic datatypes
################################################################
@@ -129,7 +128,7 @@ def call_extension_handler(key, data, f, extensions):
target = target.split(".")
if len(target) > len(extension):
continue
- if extension[-len(target):] == target:
+ if extension[-len(target) :] == target:
return f(data)
return None
@@ -269,6 +268,7 @@ def imagehandler(imagespec, extensions=image_extensions):
################################################################
# torch video
################################################################
+
'''
def torch_video(key, data):
"""Decode video using the torchvideo library.
@@ -289,6 +289,7 @@ def torch_video(key, data):
return torchvision.io.read_video(fname, pts_unit="sec")
'''
+
################################################################
# paddlespeech.audio
################################################################
@@ -358,6 +359,7 @@ def gzfilter(key, data):
# decode entire training amples
################################################################
+
default_pre_handlers = [gzfilter]
default_post_handlers = [basichandlers]
@@ -385,8 +387,7 @@ class Decoder:
pre = default_pre_handlers
if post is None:
post = default_post_handlers
- assert all(callable(h)
- for h in handlers), f"one of {handlers} not callable"
+ assert all(callable(h) for h in handlers), f"one of {handlers} not callable"
assert all(callable(h) for h in pre), f"one of {pre} not callable"
assert all(callable(h) for h in post), f"one of {post} not callable"
self.handlers = pre + handlers + post
diff --git a/paddlespeech/audio/streamdata/cache.py b/paddlespeech/audio/streamdata/cache.py
index faa196398..e7bbffa1b 100644
--- a/paddlespeech/audio/streamdata/cache.py
+++ b/paddlespeech/audio/streamdata/cache.py
@@ -2,10 +2,7 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
-import os
-import random
-import re
-import sys
+import itertools, os, random, re, sys
from urllib.parse import urlparse
from . import filters
@@ -43,7 +40,7 @@ def lru_cleanup(cache_dir, cache_size, keyfn=os.path.getctime, verbose=False):
os.remove(fname)
-def download(url, dest, chunk_size=1024**2, verbose=False):
+def download(url, dest, chunk_size=1024 ** 2, verbose=False):
"""Download a file from `url` to `dest`."""
temp = dest + f".temp{os.getpid()}"
with gopen.gopen(url) as stream:
@@ -68,11 +65,12 @@ def pipe_cleaner(spec):
def get_file_cached(
- spec,
- cache_size=-1,
- cache_dir=None,
- url_to_name=pipe_cleaner,
- verbose=False, ):
+ spec,
+ cache_size=-1,
+ cache_dir=None,
+ url_to_name=pipe_cleaner,
+ verbose=False,
+):
if cache_size == -1:
cache_size = default_cache_size
if cache_dir is None:
@@ -109,14 +107,15 @@ verbose_cache = int(os.environ.get("WDS_VERBOSE_CACHE", "0"))
def cached_url_opener(
- data,
- handler=reraise_exception,
- cache_size=-1,
- cache_dir=None,
- url_to_name=pipe_cleaner,
- validator=check_tar_format,
- verbose=False,
- always=False, ):
+ data,
+ handler=reraise_exception,
+ cache_size=-1,
+ cache_dir=None,
+ url_to_name=pipe_cleaner,
+ validator=check_tar_format,
+ verbose=False,
+ always=False,
+):
"""Given a stream of url names (packaged in `dict(url=url)`), yield opened streams."""
verbose = verbose or verbose_cache
for sample in data:
@@ -133,7 +132,8 @@ def cached_url_opener(
cache_size=cache_size,
cache_dir=cache_dir,
url_to_name=url_to_name,
- verbose=verbose, )
+ verbose=verbose,
+ )
if verbose:
print("# opening %s" % dest, file=sys.stderr)
assert os.path.exists(dest)
@@ -143,8 +143,9 @@ def cached_url_opener(
data = f.read(200)
os.remove(dest)
raise ValueError(
- "%s (%s) is not a tar archive, but a %s, contains %s" %
- (dest, url, ftype, repr(data)))
+ "%s (%s) is not a tar archive, but a %s, contains %s"
+ % (dest, url, ftype, repr(data))
+ )
try:
stream = open(dest, "rb")
sample.update(stream=stream)
@@ -157,7 +158,7 @@ def cached_url_opener(
continue
raise exn
except Exception as exn:
- exn.args = exn.args + (url, )
+ exn.args = exn.args + (url,)
if handler(exn):
continue
else:
@@ -165,13 +166,14 @@ def cached_url_opener(
def cached_tarfile_samples(
- src,
- handler=reraise_exception,
- cache_size=-1,
- cache_dir=None,
- verbose=False,
- url_to_name=pipe_cleaner,
- always=False, ):
+ src,
+ handler=reraise_exception,
+ cache_size=-1,
+ cache_dir=None,
+ verbose=False,
+ url_to_name=pipe_cleaner,
+ always=False,
+):
streams = cached_url_opener(
src,
handler=handler,
@@ -179,7 +181,8 @@ def cached_tarfile_samples(
cache_dir=cache_dir,
verbose=verbose,
url_to_name=url_to_name,
- always=always, )
+ always=always,
+ )
samples = tar_file_and_group_expander(streams, handler=handler)
return samples
diff --git a/paddlespeech/audio/streamdata/compat.py b/paddlespeech/audio/streamdata/compat.py
index 9012eeb10..deda53384 100644
--- a/paddlespeech/audio/streamdata/compat.py
+++ b/paddlespeech/audio/streamdata/compat.py
@@ -2,17 +2,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
-import yaml
+from dataclasses import dataclass
+from itertools import islice
+from typing import List
+
+import braceexpand, yaml
from . import autodecode
-from . import cache
-from . import filters
-from . import shardlists
-from . import tariterators
+from . import cache, filters, shardlists, tariterators
from .filters import reraise_exception
-from .paddle_utils import DataLoader
-from .paddle_utils import IterableDataset
from .pipeline import DataPipeline
+from .paddle_utils import DataLoader, IterableDataset
class FluidInterface:
@@ -26,8 +26,7 @@ class FluidInterface:
return self.compose(filters.unbatched())
def listed(self, batchsize, partial=True):
- return self.compose(
- filters.batched(), batchsize=batchsize, collation_fn=None)
+ return self.compose(filters.batched(), batchsize=batchsize, collation_fn=None)
def unlisted(self):
return self.compose(filters.unlisted())
@@ -44,19 +43,9 @@ class FluidInterface:
def map(self, f, handler=reraise_exception):
return self.compose(filters.map(f, handler=handler))
- def decode(self,
- *args,
- pre=None,
- post=None,
- only=None,
- partial=False,
- handler=reraise_exception):
- handlers = [
- autodecode.ImageHandler(x) if isinstance(x, str) else x
- for x in args
- ]
- decoder = autodecode.Decoder(
- handlers, pre=pre, post=post, only=only, partial=partial)
+ def decode(self, *args, pre=None, post=None, only=None, partial=False, handler=reraise_exception):
+ handlers = [autodecode.ImageHandler(x) if isinstance(x, str) else x for x in args]
+ decoder = autodecode.Decoder(handlers, pre=pre, post=post, only=only, partial=partial)
return self.map(decoder, handler=handler)
def map_dict(self, handler=reraise_exception, **kw):
@@ -91,12 +80,12 @@ class FluidInterface:
def audio_data_filter(self, *args, **kw):
return self.compose(filters.audio_data_filter(*args, **kw))
-
+
def audio_tokenize(self, *args, **kw):
return self.compose(filters.audio_tokenize(*args, **kw))
def resample(self, *args, **kw):
- return self.compose(filters.resample(*args, **kw))
+ return self.compose(filters.resample(*args, **kw))
def audio_compute_fbank(self, *args, **kw):
return self.compose(filters.audio_compute_fbank(*args, **kw))
@@ -113,28 +102,27 @@ class FluidInterface:
def audio_cmvn(self, cmvn_file):
return self.compose(filters.audio_cmvn(cmvn_file))
-
class WebDataset(DataPipeline, FluidInterface):
"""Small fluid-interface wrapper for DataPipeline."""
def __init__(
- self,
- urls,
- handler=reraise_exception,
- resampled=False,
- repeat=False,
- shardshuffle=None,
- cache_size=0,
- cache_dir=None,
- detshuffle=False,
- nodesplitter=shardlists.single_node_only,
- verbose=False, ):
+ self,
+ urls,
+ handler=reraise_exception,
+ resampled=False,
+ repeat=False,
+ shardshuffle=None,
+ cache_size=0,
+ cache_dir=None,
+ detshuffle=False,
+ nodesplitter=shardlists.single_node_only,
+ verbose=False,
+ ):
super().__init__()
if isinstance(urls, IterableDataset):
assert not resampled
self.append(urls)
- elif isinstance(urls, str) and (urls.endswith(".yaml") or
- urls.endswith(".yml")):
+ elif isinstance(urls, str) and (urls.endswith(".yaml") or urls.endswith(".yml")):
with (open(urls)) as stream:
spec = yaml.safe_load(stream)
assert "datasets" in spec
@@ -164,7 +152,9 @@ class WebDataset(DataPipeline, FluidInterface):
handler=handler,
verbose=verbose,
cache_size=cache_size,
- cache_dir=cache_dir, ))
+ cache_dir=cache_dir,
+ )
+ )
class FluidWrapper(DataPipeline, FluidInterface):
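The WebDataset wrapper above layers the FluidInterface methods onto DataPipeline, so the audio filters can be chained fluently. A rough sketch under assumed inputs; the shard pattern and the tiny symbol table are placeholders, and the parameter values simply echo the filter defaults:

from paddlespeech.audio.streamdata import WebDataset

symbol_table = {"<blank>": 0, "<unk>": 1, "a": 2}   # placeholder vocabulary

dataset = (
    WebDataset("shards/train-{000000..000009}.tar")          # placeholder shard pattern
    .audio_tokenize(symbol_table)                            # map transcripts to label ids
    .audio_data_filter(max_length=10240, min_length=10)      # drop utterances outside the length window
    .audio_compute_fbank(num_mel_bins=80, frame_shift=10)    # 80-dim log-mel filterbank features
)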
diff --git a/paddlespeech/audio/streamdata/extradatasets.py b/paddlespeech/audio/streamdata/extradatasets.py
index 76361c24a..e6d617724 100644
--- a/paddlespeech/audio/streamdata/extradatasets.py
+++ b/paddlespeech/audio/streamdata/extradatasets.py
@@ -5,10 +5,20 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
+
+
"""Train PyTorch models directly from POSIX tar archive.
Code works locally or over HTTP connections.
"""
+
+import itertools as itt
+import os
+import random
+import sys
+
+import braceexpand
+
from . import utils
from .paddle_utils import IterableDataset
from .utils import PipelineStage
@@ -53,7 +63,8 @@ class repeatedly(IterableDataset, PipelineStage):
return utils.repeatedly(
source,
nepochs=self.nepochs,
- nbatches=self.nbatches, )
+ nbatches=self.nbatches,
+ )
class with_epoch(IterableDataset):
diff --git a/paddlespeech/audio/streamdata/filters.py b/paddlespeech/audio/streamdata/filters.py
index 68d6830bb..82b9c6bab 100644
--- a/paddlespeech/audio/streamdata/filters.py
+++ b/paddlespeech/audio/streamdata/filters.py
@@ -3,6 +3,7 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
+
# Modified from https://github.com/webdataset/webdataset
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""A collection of iterators for data transformations.
@@ -11,29 +12,28 @@ These functions are plain iterator functions. You can find curried versions
in webdataset.filters, and you can find IterableDataset wrappers in
webdataset.processing.
"""
+
import io
-import itertools
-import os
-import random
-import re
-import sys
-import time
from fnmatch import fnmatch
-from functools import reduce
+import re
+import itertools, os, random, sys, time
+from functools import reduce, wraps
-import paddle
+import numpy as np
from . import autodecode
-from . import utils
+from . import utils
+from .paddle_utils import PaddleTensor
+from .utils import PipelineStage
+
from .. import backends
from ..compliance import kaldi
+import paddle
from ..transform.cmvn import GlobalCMVN
-from ..transform.spec_augment import freq_mask
-from ..transform.spec_augment import time_mask
-from ..transform.spec_augment import time_warp
from ..utils.tensor_utils import pad_sequence
-from .utils import PipelineStage
-
+from ..transform.spec_augment import time_warp
+from ..transform.spec_augment import time_mask
+from ..transform.spec_augment import freq_mask
class FilterFunction(object):
"""Helper class for currying pipeline stages.
@@ -159,12 +159,10 @@ def transform_with(sample, transformers):
result[i] = f(sample[i])
return result
-
###
# Iterators
###
-
def _info(data, fmt=None, n=3, every=-1, width=50, stream=sys.stderr, name=""):
"""Print information about the samples that are passing through.
@@ -280,16 +278,10 @@ def _log_keys(data, logfile=None):
log_keys = pipelinefilter(_log_keys)
-def _minedecode(x):
- if isinstance(x, str):
- return autodecode.imagehandler(x)
- else:
- return x
-
-
def _decode(data, *args, handler=reraise_exception, **kw):
"""Decode data based on the decoding functions given as arguments."""
- decoder = _minedecode
+
+ decoder = lambda x: autodecode.imagehandler(x) if isinstance(x, str) else x
handlers = [decoder(x) for x in args]
f = autodecode.Decoder(handlers, **kw)
@@ -333,24 +325,15 @@ def _rename(data, handler=reraise_exception, keep=True, **kw):
for sample in data:
try:
if not keep:
- yield {
- k: getfirst(sample, v, missing_is_error=True)
- for k, v in kw.items()
- }
+ yield {k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()}
else:
def listify(v):
return v.split(";") if isinstance(v, str) else v
to_be_replaced = {x for v in kw.values() for x in listify(v)}
- result = {
- k: v
- for k, v in sample.items() if k not in to_be_replaced
- }
- result.update({
- k: getfirst(sample, v, missing_is_error=True)
- for k, v in kw.items()
- })
+ result = {k: v for k, v in sample.items() if k not in to_be_replaced}
+ result.update({k: getfirst(sample, v, missing_is_error=True) for k, v in kw.items()})
yield result
except Exception as exn:
if handler(exn):
@@ -398,11 +381,7 @@ def _map_dict(data, handler=reraise_exception, **kw):
map_dict = pipelinefilter(_map_dict)
-def _to_tuple(data,
- *args,
- handler=reraise_exception,
- missing_is_error=True,
- none_is_error=None):
+def _to_tuple(data, *args, handler=reraise_exception, missing_is_error=True, none_is_error=None):
"""Convert dict samples to tuples."""
if none_is_error is None:
none_is_error = missing_is_error
@@ -411,10 +390,7 @@ def _to_tuple(data,
for sample in data:
try:
- result = tuple([
- getfirst(sample, f, missing_is_error=missing_is_error)
- for f in args
- ])
+ result = tuple([getfirst(sample, f, missing_is_error=missing_is_error) for f in args])
if none_is_error and any(x is None for x in result):
raise ValueError(f"to_tuple {args} got {sample.keys()}")
yield result
@@ -487,28 +463,19 @@ rsample = pipelinefilter(_rsample)
slice = pipelinefilter(itertools.islice)
-def _extract_keys(source,
- *patterns,
- duplicate_is_error=True,
- ignore_missing=False):
+def _extract_keys(source, *patterns, duplicate_is_error=True, ignore_missing=False):
for sample in source:
result = []
for pattern in patterns:
- pattern = pattern.split(";") if isinstance(pattern,
- str) else pattern
- matches = [
- x for x in sample.keys()
- if any(fnmatch("." + x, p) for p in pattern)
- ]
+ pattern = pattern.split(";") if isinstance(pattern, str) else pattern
+ matches = [x for x in sample.keys() if any(fnmatch("." + x, p) for p in pattern)]
if len(matches) == 0:
if ignore_missing:
continue
else:
- raise ValueError(
- f"Cannot find {pattern} in sample keys {sample.keys()}.")
+ raise ValueError(f"Cannot find {pattern} in sample keys {sample.keys()}.")
if len(matches) > 1 and duplicate_is_error:
- raise ValueError(
- f"Multiple sample keys {sample.keys()} match {pattern}.")
+ raise ValueError(f"Multiple sample keys {sample.keys()} match {pattern}.")
value = sample[matches[0]]
result.append(value)
yield tuple(result)
@@ -517,12 +484,7 @@ def _extract_keys(source,
extract_keys = pipelinefilter(_extract_keys)
-def _rename_keys(source,
- *args,
- keep_unselected=False,
- must_match=True,
- duplicate_is_error=True,
- **kw):
+def _rename_keys(source, *args, keep_unselected=False, must_match=True, duplicate_is_error=True, **kw):
renamings = [(pattern, output) for output, pattern in args]
renamings += [(pattern, output) for output, pattern in kw.items()]
for sample in source:
@@ -542,15 +504,11 @@ def _rename_keys(source,
continue
if new_name in new_sample:
if duplicate_is_error:
- raise ValueError(
- f"Duplicate value in sample {sample.keys()} after rename."
- )
+ raise ValueError(f"Duplicate value in sample {sample.keys()} after rename.")
continue
new_sample[new_name] = value
if must_match and not all(matched.values()):
- raise ValueError(
- f"Not all patterns ({matched}) matched sample keys ({sample.keys()})."
- )
+ raise ValueError(f"Not all patterns ({matched}) matched sample keys ({sample.keys()}).")
yield new_sample
@@ -583,18 +541,18 @@ def find_decoder(decoders, path):
if fname.startswith("__"):
return lambda x: x
for pattern, fun in decoders[::-1]:
- if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(),
- pattern):
+ if fnmatch(fname.lower(), pattern) or fnmatch("." + fname.lower(), pattern):
return fun
return None
def _xdecode(
- source,
- *args,
- must_decode=True,
- defaults=default_decoders,
- **kw, ):
+ source,
+ *args,
+ must_decode=True,
+ defaults=default_decoders,
+ **kw,
+):
decoders = list(defaults) + list(args)
decoders += [("*." + k, v) for k, v in kw.items()]
for sample in source:
@@ -617,18 +575,18 @@ def _xdecode(
new_sample[path] = value
yield new_sample
-
xdecode = pipelinefilter(_xdecode)
+
def _audio_data_filter(source,
- frame_shift=10,
- max_length=10240,
- min_length=10,
- token_max_length=200,
- token_min_length=1,
- min_output_input_ratio=0.0005,
- max_output_input_ratio=1):
+ frame_shift=10,
+ max_length=10240,
+ min_length=10,
+ token_max_length=200,
+ token_min_length=1,
+ min_output_input_ratio=0.0005,
+ max_output_input_ratio=1):
""" Filter sample according to feature and label length
Inplace operation.
@@ -655,8 +613,7 @@ def _audio_data_filter(source,
assert 'wav' in sample
assert 'label' in sample
# sample['wav'] is paddle.Tensor, we have 100 frames every second (default)
- num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (
- 1000 / frame_shift)
+ num_frames = sample['wav'].shape[1] / sample['sample_rate'] * (1000 / frame_shift)
if num_frames < min_length:
continue
if num_frames > max_length:
@@ -672,15 +629,13 @@ def _audio_data_filter(source,
continue
yield sample
-
audio_data_filter = pipelinefilter(_audio_data_filter)
-
def _audio_tokenize(source,
- symbol_table,
- bpe_model=None,
- non_lang_syms=None,
- split_with_space=False):
+ symbol_table,
+ bpe_model=None,
+ non_lang_syms=None,
+ split_with_space=False):
""" Decode text to chars or BPE
Inplace operation
@@ -738,10 +693,8 @@ def _audio_tokenize(source,
sample['label'] = label
yield sample
-
audio_tokenize = pipelinefilter(_audio_tokenize)
-
def _audio_resample(source, resample_rate=16000):
""" Resample data.
Inplace operation.
@@ -760,22 +713,18 @@ def _audio_resample(source, resample_rate=16000):
waveform = sample['wav']
if sample_rate != resample_rate:
sample['sample_rate'] = resample_rate
- sample['wav'] = paddle.to_tensor(
- backends.soundfile_backend.resample(
- waveform.numpy(),
- src_sr=sample_rate,
- target_sr=resample_rate))
+            sample['wav'] = paddle.to_tensor(backends.soundfile_backend.resample(
+                waveform.numpy(), src_sr=sample_rate, target_sr=resample_rate
+            ))
yield sample
-
audio_resample = pipelinefilter(_audio_resample)
-
def _audio_compute_fbank(source,
- num_mel_bins=80,
- frame_length=25,
- frame_shift=10,
- dither=0.0):
+ num_mel_bins=80,
+ frame_length=25,
+ frame_shift=10,
+ dither=0.0):
""" Extract fbank
Args:
@@ -797,33 +746,30 @@ def _audio_compute_fbank(source,
waveform = sample['wav']
waveform = waveform * (1 << 15)
# Only keep fname, feat, label
- mat = kaldi.fbank(
- waveform,
- n_mels=num_mel_bins,
- frame_length=frame_length,
- frame_shift=frame_shift,
- dither=dither,
- energy_floor=0.0,
- sr=sample_rate)
+ mat = kaldi.fbank(waveform,
+ n_mels=num_mel_bins,
+ frame_length=frame_length,
+ frame_shift=frame_shift,
+ dither=dither,
+ energy_floor=0.0,
+ sr=sample_rate)
yield dict(fname=sample['fname'], label=sample['label'], feat=mat)
audio_compute_fbank = pipelinefilter(_audio_compute_fbank)
-
-def _audio_spec_aug(
- source,
- max_w=5,
- w_inplace=True,
- w_mode="PIL",
- max_f=30,
- num_f_mask=2,
- f_inplace=True,
- f_replace_with_zero=False,
- max_t=40,
- num_t_mask=2,
- t_inplace=True,
- t_replace_with_zero=False, ):
+def _audio_spec_aug(source,
+ max_w=5,
+ w_inplace=True,
+ w_mode="PIL",
+ max_f=30,
+ num_f_mask=2,
+ f_inplace=True,
+ f_replace_with_zero=False,
+ max_t=40,
+ num_t_mask=2,
+ t_inplace=True,
+ t_replace_with_zero=False,):
""" Do spec augmentation
Inplace operation
@@ -847,23 +793,12 @@ def _audio_spec_aug(
for sample in source:
x = sample['feat']
x = x.numpy()
- x = time_warp(x, max_time_warp=max_w, inplace=w_inplace, mode=w_mode)
- x = freq_mask(
- x,
- F=max_f,
- n_mask=num_f_mask,
- inplace=f_inplace,
- replace_with_zero=f_replace_with_zero)
- x = time_mask(
- x,
- T=max_t,
- n_mask=num_t_mask,
- inplace=t_inplace,
- replace_with_zero=t_replace_with_zero)
+        x = time_warp(x, max_time_warp=max_w, inplace=w_inplace, mode=w_mode)
+        x = freq_mask(x, F=max_f, n_mask=num_f_mask, inplace=f_inplace, replace_with_zero=f_replace_with_zero)
+        x = time_mask(x, T=max_t, n_mask=num_t_mask, inplace=t_inplace, replace_with_zero=t_replace_with_zero)
sample['feat'] = paddle.to_tensor(x, dtype=paddle.float32)
yield sample
-
audio_spec_aug = pipelinefilter(_audio_spec_aug)
@@ -894,10 +829,8 @@ def _sort(source, sort_size=500):
for x in buf:
yield x
-
sort = pipelinefilter(_sort)
-
def _batched(source, batch_size=16):
""" Static batch the data by `batch_size`
@@ -917,10 +850,8 @@ def _batched(source, batch_size=16):
if len(buf) > 0:
yield buf
-
batched = pipelinefilter(_batched)
-
def dynamic_batched(source, max_frames_in_batch=12000):
""" Dynamic batch the data until the total frames in batch
reach `max_frames_in_batch`
@@ -961,8 +892,8 @@ def _audio_padding(source):
"""
for sample in source:
assert isinstance(sample, list)
- feats_length = paddle.to_tensor(
- [x['feat'].shape[0] for x in sample], dtype="int64")
+ feats_length = paddle.to_tensor([x['feat'].shape[0] for x in sample],
+ dtype="int64")
order = paddle.argsort(feats_length, descending=True)
feats_lengths = paddle.to_tensor(
[sample[i]['feat'].shape[0] for i in order], dtype="int64")
@@ -971,20 +902,20 @@ def _audio_padding(source):
sorted_labels = [
paddle.to_tensor(sample[i]['label'], dtype="int32") for i in order
]
- label_lengths = paddle.to_tensor(
- [x.shape[0] for x in sorted_labels], dtype="int64")
- padded_feats = pad_sequence(
- sorted_feats, batch_first=True, padding_value=0)
- padding_labels = pad_sequence(
- sorted_labels, batch_first=True, padding_value=-1)
-
- yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
+ label_lengths = paddle.to_tensor([x.shape[0] for x in sorted_labels],
+ dtype="int64")
+ padded_feats = pad_sequence(sorted_feats,
+ batch_first=True,
+ padding_value=0)
+ padding_labels = pad_sequence(sorted_labels,
+ batch_first=True,
+ padding_value=-1)
+
+ yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
label_lengths)
-
audio_padding = pipelinefilter(_audio_padding)
-
def _audio_cmvn(source, cmvn_file):
global_cmvn = GlobalCMVN(cmvn_file)
for batch in source:
@@ -992,16 +923,13 @@ def _audio_cmvn(source, cmvn_file):
padded_feats = padded_feats.numpy()
padded_feats = global_cmvn(padded_feats)
padded_feats = paddle.to_tensor(padded_feats, dtype=paddle.float32)
- yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
- label_lengths)
-
+ yield (sorted_keys, padded_feats, feats_lengths, padding_labels,
+ label_lengths)
audio_cmvn = pipelinefilter(_audio_cmvn)
-
def _placeholder(source):
for data in source:
yield data
-
placeholder = pipelinefilter(_placeholder)
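As the module docstring notes, each iterator function here also gets a curried form via pipelinefilter, e.g. batched = pipelinefilter(_batched). A small illustration, not part of this patch, using synthetic samples:

from paddlespeech.audio.streamdata.filters import batched

samples = ({"id": i} for i in range(10))   # any iterable of samples will do
stage = batched(4)                          # curried: only configures batch_size, takes no data yet
for batch in stage(samples):                # applying the stage to a source yields lists of samples
    print(len(batch))                       # -> 4, 4, 2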
diff --git a/paddlespeech/audio/streamdata/gopen.py b/paddlespeech/audio/streamdata/gopen.py
index 60a434603..457d048a6 100644
--- a/paddlespeech/audio/streamdata/gopen.py
+++ b/paddlespeech/audio/streamdata/gopen.py
@@ -3,12 +3,12 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
+
+
"""Open URLs by calling subcommands."""
-import os
-import re
-import sys
-from subprocess import PIPE
-from subprocess import Popen
+
+import os, sys, re
+from subprocess import PIPE, Popen
from urllib.parse import urlparse
# global used for printing additional node information during verbose output
@@ -31,13 +31,14 @@ class Pipe:
"""
def __init__(
- self,
- *args,
- mode=None,
- timeout=7200.0,
- ignore_errors=False,
- ignore_status=[],
- **kw, ):
+ self,
+ *args,
+ mode=None,
+ timeout=7200.0,
+ ignore_errors=False,
+ ignore_status=[],
+ **kw,
+ ):
"""Create an IO Pipe."""
self.ignore_errors = ignore_errors
self.ignore_status = [0] + ignore_status
@@ -74,7 +75,8 @@ class Pipe:
if verbose:
print(
f"pipe exit [{self.status} {os.getpid()}:{self.proc.pid}] {self.args} {info}",
- file=sys.stderr, )
+ file=sys.stderr,
+ )
if self.status not in self.ignore_status and not self.ignore_errors:
raise Exception(f"{self.args}: exit {self.status} (read) {info}")
@@ -112,11 +114,9 @@ class Pipe:
self.close()
-def set_options(obj,
- timeout=None,
- ignore_errors=None,
- ignore_status=None,
- handler=None):
+def set_options(
+ obj, timeout=None, ignore_errors=None, ignore_status=None, handler=None
+):
"""Set options for Pipes.
This function can be called on any stream. It will set pipe options only
@@ -168,14 +168,16 @@ def gopen_pipe(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141], ) # skipcq: BAN-B604
+ ignore_status=[141],
+ ) # skipcq: BAN-B604
elif mode[0] == "w":
return Pipe(
cmd,
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141], ) # skipcq: BAN-B604
+ ignore_status=[141],
+ ) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
@@ -194,7 +196,8 @@ def gopen_curl(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141, 23], ) # skipcq: BAN-B604
+ ignore_status=[141, 23],
+ ) # skipcq: BAN-B604
elif mode[0] == "w":
cmd = f"curl -s -L -T - '{url}'"
return Pipe(
@@ -202,7 +205,8 @@ def gopen_curl(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141, 26], ) # skipcq: BAN-B604
+ ignore_status=[141, 26],
+ ) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
@@ -222,13 +226,15 @@ def gopen_htgs(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141, 23], ) # skipcq: BAN-B604
+ ignore_status=[141, 23],
+ ) # skipcq: BAN-B604
elif mode[0] == "w":
raise ValueError(f"{mode}: cannot write")
else:
raise ValueError(f"{mode}: unknown mode")
+
def gopen_gsutil(url, mode="rb", bufsize=8192):
"""Open a URL with `curl`.
@@ -243,7 +249,8 @@ def gopen_gsutil(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141, 23], ) # skipcq: BAN-B604
+ ignore_status=[141, 23],
+ ) # skipcq: BAN-B604
elif mode[0] == "w":
cmd = f"gsutil cp - '{url}'"
return Pipe(
@@ -251,11 +258,13 @@ def gopen_gsutil(url, mode="rb", bufsize=8192):
mode=mode,
shell=True,
bufsize=bufsize,
- ignore_status=[141, 26], ) # skipcq: BAN-B604
+ ignore_status=[141, 26],
+ ) # skipcq: BAN-B604
else:
raise ValueError(f"{mode}: unknown mode")
+
def gopen_error(url, *args, **kw):
"""Raise a value error.
@@ -276,7 +285,8 @@ gopen_schemes = dict(
ftps=gopen_curl,
scp=gopen_curl,
gs=gopen_gsutil,
- htgs=gopen_htgs, )
+ htgs=gopen_htgs,
+)
def gopen(url, mode="rb", bufsize=8192, **kw):
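gopen dispatches on the URL scheme (pipe:, http/https via curl, gs:, htgs:, plain paths) and, for the remote schemes, returns a stream backed by a subprocess. A hedged sketch with a harmless local command standing in for a real remote URL:

from paddlespeech.audio.streamdata.gopen import gopen

stream = gopen("pipe:echo hello", "rb")   # runs the command in a shell and reads its stdout
data = stream.read()                       # -> b"hello\n"
stream.close()                             # waits for the subprocess and checks its exit status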
diff --git a/paddlespeech/audio/streamdata/handlers.py b/paddlespeech/audio/streamdata/handlers.py
index 0173e5373..7f3d28b62 100644
--- a/paddlespeech/audio/streamdata/handlers.py
+++ b/paddlespeech/audio/streamdata/handlers.py
@@ -3,6 +3,7 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
+
"""Pluggable exception handlers.
These are functions that take an exception as an argument and then return...
@@ -13,8 +14,8 @@ These are functions that take an exception as an argument and then return...
They are used as handler= arguments in much of the library.
"""
-import time
-import warnings
+
+import time, warnings
def reraise_exception(exn):
diff --git a/paddlespeech/audio/streamdata/mix.py b/paddlespeech/audio/streamdata/mix.py
index 37556ed94..7d790f00f 100644
--- a/paddlespeech/audio/streamdata/mix.py
+++ b/paddlespeech/audio/streamdata/mix.py
@@ -5,12 +5,17 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
+
"""Classes for mixing samples from multiple sources."""
-import random
+
+import itertools, os, random, time, sys
+from functools import reduce, wraps
import numpy as np
-from .paddle_utils import IterableDataset
+from . import autodecode, utils
+from .paddle_utils import PaddleTensor, IterableDataset
+from .utils import PipelineStage
def round_robin_shortest(*sources):
diff --git a/paddlespeech/audio/streamdata/paddle_utils.py b/paddlespeech/audio/streamdata/paddle_utils.py
index c2ad8756b..02bc4c841 100644
--- a/paddlespeech/audio/streamdata/paddle_utils.py
+++ b/paddlespeech/audio/streamdata/paddle_utils.py
@@ -5,11 +5,12 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
+
"""Mock implementations of paddle interfaces when paddle is not available."""
+
try:
- from paddle.io import DataLoader
- from paddle.io import IterableDataset
+ from paddle.io import DataLoader, IterableDataset
except ModuleNotFoundError:
class IterableDataset:
@@ -21,3 +22,12 @@ except ModuleNotFoundError:
"""Empty implementation of DataLoader when paddle is not available."""
pass
+
+try:
+ from paddle import Tensor as PaddleTensor
+except ModuleNotFoundError:
+
+    class PaddleTensor:
+ """Empty implementation of PaddleTensor when paddle is not available."""
+
+ pass
diff --git a/paddlespeech/audio/streamdata/pipeline.py b/paddlespeech/audio/streamdata/pipeline.py
index ff16760ae..7339a762a 100644
--- a/paddlespeech/audio/streamdata/pipeline.py
+++ b/paddlespeech/audio/streamdata/pipeline.py
@@ -3,12 +3,15 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#%%
-import copy
-import sys
+import copy, os, random, sys, time
+from dataclasses import dataclass
from itertools import islice
+from typing import List
-from .paddle_utils import DataLoader
-from .paddle_utils import IterableDataset
+import braceexpand, yaml
+
+from .handlers import reraise_exception
+from .paddle_utils import DataLoader, IterableDataset
from .utils import PipelineStage
@@ -19,7 +22,8 @@ def add_length_method(obj):
Combined = type(
obj.__class__.__name__ + "_Length",
(obj.__class__, IterableDataset),
- {"__len__": length}, )
+ {"__len__": length},
+ )
obj.__class__ = Combined
return obj
diff --git a/paddlespeech/audio/streamdata/shardlists.py b/paddlespeech/audio/streamdata/shardlists.py
index 54f501052..cfaf9a64b 100644
--- a/paddlespeech/audio/streamdata/shardlists.py
+++ b/paddlespeech/audio/streamdata/shardlists.py
@@ -4,30 +4,28 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
+
# Modified from https://github.com/webdataset/webdataset
+
"""Train PyTorch models directly from POSIX tar archive.
Code works locally or over HTTP connections.
"""
-import os
-import random
-import sys
-import time
-from dataclasses import dataclass
-from dataclasses import field
+
+import os, random, sys, time
+from dataclasses import dataclass, field
from itertools import islice
from typing import List
-import braceexpand
-import yaml
+import braceexpand, yaml
from . import utils
-from ..utils.log import Logger
from .filters import pipelinefilter
from .paddle_utils import IterableDataset
-logger = Logger(__name__)
+from ..utils.log import Logger
+logger = Logger(__name__)
def expand_urls(urls):
if isinstance(urls, str):
urllist = urls.split("::")
@@ -66,8 +64,7 @@ class SimpleShardList(IterableDataset):
def split_by_node(src, group=None):
- rank, world_size, worker, num_workers = utils.paddle_worker_info(
- group=group)
+ rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group)
logger.info(f"world_size:{world_size}, rank:{rank}")
if world_size > 1:
for s in islice(src, rank, None, world_size):
@@ -78,11 +75,9 @@ def split_by_node(src, group=None):
def single_node_only(src, group=None):
- rank, world_size, worker, num_workers = utils.paddle_worker_info(
- group=group)
+ rank, world_size, worker, num_workers = utils.paddle_worker_info(group=group)
if world_size > 1:
- raise ValueError(
- "input pipeline needs to be reconfigured for multinode training")
+ raise ValueError("input pipeline needs to be reconfigured for multinode training")
for s in src:
yield s
@@ -109,8 +104,7 @@ def resampled_(src, n=sys.maxsize):
rng = random.Random(seed)
print("# resampled loading", file=sys.stderr)
items = list(src)
- print(
- f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr)
+ print(f"# resampled got {len(items)} samples, yielding {n}", file=sys.stderr)
for i in range(n):
yield rng.choice(items)
@@ -124,9 +118,7 @@ def non_empty(src):
yield s
count += 1
if count == 0:
- raise ValueError(
- "pipeline stage received no data at all and this was declared as an error"
- )
+ raise ValueError("pipeline stage received no data at all and this was declared as an error")
@dataclass
@@ -146,6 +138,10 @@ def expand(s):
return os.path.expanduser(os.path.expandvars(s))
+class MultiShardSample(IterableDataset):
+ def __init__(self, fname):
+ """Construct a shardlist from multiple sources using a YAML spec."""
+ self.epoch = -1
class MultiShardSample(IterableDataset):
def __init__(self, fname):
"""Construct a shardlist from multiple sources using a YAML spec."""
@@ -160,23 +156,20 @@ class MultiShardSample(IterableDataset):
else:
with open(fname) as stream:
spec = yaml.safe_load(stream)
- assert set(spec.keys()).issubset(
- set("prefix datasets buckets".split())), list(spec.keys())
+ assert set(spec.keys()).issubset(set("prefix datasets buckets".split())), list(spec.keys())
prefix = expand(spec.get("prefix", ""))
self.sources = []
for ds in spec["datasets"]:
- assert set(ds.keys()).issubset(
- set("buckets name shards resample choose".split())), list(
- ds.keys())
+ assert set(ds.keys()).issubset(set("buckets name shards resample choose".split())), list(
+ ds.keys()
+ )
buckets = ds.get("buckets", spec.get("buckets", []))
if isinstance(buckets, str):
buckets = [buckets]
buckets = [expand(s) for s in buckets]
if buckets == []:
buckets = [""]
- assert len(
- buckets
- ) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented"
+ assert len(buckets) == 1, f"{buckets}: FIXME support for multiple buckets unimplemented"
bucket = buckets[0]
name = ds.get("name", "@" + bucket)
urls = ds["shards"]
@@ -184,19 +177,15 @@ class MultiShardSample(IterableDataset):
urls = [urls]
# urls = [u for url in urls for u in braceexpand.braceexpand(url)]
urls = [
- prefix + os.path.join(bucket, u)
- for url in urls for u in braceexpand.braceexpand(expand(url))
+ prefix + os.path.join(bucket, u) for url in urls for u in braceexpand.braceexpand(expand(url))
]
resample = ds.get("resample", -1)
nsample = ds.get("choose", -1)
if nsample > len(urls):
- raise ValueError(
- f"perepoch {nsample} must be no greater than the number of shards"
- )
+ raise ValueError(f"perepoch {nsample} must be no greater than the number of shards")
if (nsample > 0) and (resample > 0):
raise ValueError("specify only one of perepoch or choose")
- entry = MSSource(
- name=name, urls=urls, perepoch=nsample, resample=resample)
+ entry = MSSource(name=name, urls=urls, perepoch=nsample, resample=resample)
self.sources.append(entry)
print(f"# {name} {len(urls)} {nsample}", file=sys.stderr)
@@ -214,7 +203,7 @@ class MultiShardSample(IterableDataset):
# sample without replacement
l = list(source.urls)
self.rng.shuffle(l)
- l = l[:source.perepoch]
+ l = l[: source.perepoch]
else:
l = list(source.urls)
result += l
@@ -238,11 +227,12 @@ class ResampledShards(IterableDataset):
"""An iterable dataset yielding a list of urls."""
def __init__(
- self,
- urls,
- nshards=sys.maxsize,
- worker_seed=None,
- deterministic=False, ):
+ self,
+ urls,
+ nshards=sys.maxsize,
+ worker_seed=None,
+ deterministic=False,
+ ):
"""Sample shards from the shard list with replacement.
:param urls: a list of URLs as a Python list or brace notation string
@@ -262,8 +252,7 @@ class ResampledShards(IterableDataset):
if self.deterministic:
seed = utils.make_seed(self.worker_seed(), self.epoch)
else:
- seed = utils.make_seed(self.worker_seed(), self.epoch,
- os.getpid(), time.time_ns(), os.urandom(4))
+ seed = utils.make_seed(self.worker_seed(), self.epoch, os.getpid(), time.time_ns(), os.urandom(4))
if os.environ.get("WDS_SHOW_SEED", "0") == "1":
print(f"# ResampledShards seed {seed}")
self.rng = random.Random(seed)
diff --git a/paddlespeech/audio/streamdata/tariterators.py b/paddlespeech/audio/streamdata/tariterators.py
index 79b81c0ce..b1616918c 100644
--- a/paddlespeech/audio/streamdata/tariterators.py
+++ b/paddlespeech/audio/streamdata/tariterators.py
@@ -3,12 +3,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
+
# Modified from https://github.com/webdataset/webdataset
# Modified from wenet(https://github.com/wenet-e2e/wenet)
+
"""Low level iteration functions for tar archives."""
-import random
-import re
-import tarfile
+
+import random, re, tarfile
import braceexpand
@@ -26,7 +27,6 @@ import numpy as np
AUDIO_FORMAT_SETS = set(['flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'])
-
def base_plus_ext(path):
"""Split off all file extensions.
@@ -47,8 +47,12 @@ def valid_sample(sample):
:param sample: sample to be checked
"""
- return (sample is not None and isinstance(sample, dict) and
- len(list(sample.keys())) > 0 and not sample.get("__bad__", False))
+ return (
+ sample is not None
+ and isinstance(sample, dict)
+ and len(list(sample.keys())) > 0
+ and not sample.get("__bad__", False)
+ )
# FIXME: UNUSED
@@ -75,16 +79,16 @@ def url_opener(data, handler=reraise_exception, **kw):
sample.update(stream=stream)
yield sample
except Exception as exn:
- exn.args = exn.args + (url, )
+ exn.args = exn.args + (url,)
if handler(exn):
continue
else:
break
-def tar_file_iterator(fileobj,
- skip_meta=r"__[^/]*__($|/)",
- handler=reraise_exception):
+def tar_file_iterator(
+ fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception
+):
"""Iterate over tar file, yielding filename, content pairs for the given tar stream.
:param fileobj: byte stream suitable for tarfile
@@ -99,8 +103,11 @@ def tar_file_iterator(fileobj,
continue
if fname is None:
continue
- if ("/" not in fname and fname.startswith(meta_prefix) and
- fname.endswith(meta_suffix)):
+ if (
+ "/" not in fname
+ and fname.startswith(meta_prefix)
+ and fname.endswith(meta_suffix)
+ ):
# skipping metadata for now
continue
if skip_meta is not None and re.match(skip_meta, fname):
@@ -111,10 +118,8 @@ def tar_file_iterator(fileobj,
assert pos > 0
prefix, postfix = name[:pos], name[pos + 1:]
if postfix == 'wav':
- waveform, sample_rate = paddlespeech.audio.load(
- stream.extractfile(tarinfo), normal=False)
- result = dict(
- fname=prefix, wav=waveform, sample_rate=sample_rate)
+ waveform, sample_rate = paddlespeech.audio.load(stream.extractfile(tarinfo), normal=False)
+ result = dict(fname=prefix, wav=waveform, sample_rate = sample_rate)
else:
txt = stream.extractfile(tarinfo).read().decode('utf8').strip()
result = dict(fname=prefix, txt=txt)
@@ -123,17 +128,16 @@ def tar_file_iterator(fileobj,
stream.members = []
except Exception as exn:
if hasattr(exn, "args") and len(exn.args) > 0:
- exn.args = (exn.args[0] + " @ " + str(fileobj), ) + exn.args[1:]
+ exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:]
if handler(exn):
continue
else:
break
del stream
-
-def tar_file_and_group_iterator(fileobj,
- skip_meta=r"__[^/]*__($|/)",
- handler=reraise_exception):
+def tar_file_and_group_iterator(
+ fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception
+):
""" Expand a stream of open tar files into a stream of tar file contents.
And groups the file with same prefix
@@ -163,11 +167,8 @@ def tar_file_and_group_iterator(fileobj,
if postfix == 'txt':
example['txt'] = file_obj.read().decode('utf8').strip()
elif postfix in AUDIO_FORMAT_SETS:
- waveform, sample_rate = paddlespeech.audio.load(
- file_obj, normal=False)
- waveform = paddle.to_tensor(
- np.expand_dims(np.array(waveform), 0),
- dtype=paddle.float32)
+ waveform, sample_rate = paddlespeech.audio.load(file_obj, normal=False)
+ waveform = paddle.to_tensor(np.expand_dims(np.array(waveform),0), dtype=paddle.float32)
example['wav'] = waveform
example['sample_rate'] = sample_rate
@@ -175,21 +176,19 @@ def tar_file_and_group_iterator(fileobj,
example[postfix] = file_obj.read()
except Exception as exn:
if hasattr(exn, "args") and len(exn.args) > 0:
- exn.args = (exn.args[0] + " @ " + str(fileobj),
- ) + exn.args[1:]
+ exn.args = (exn.args[0] + " @ " + str(fileobj),) + exn.args[1:]
if handler(exn):
continue
else:
break
valid = False
- # logging.warning('error to parse {}'.format(name))
+ # logging.warning('error to parse {}'.format(name))
prev_prefix = prefix
if prev_prefix is not None:
example['fname'] = prev_prefix
yield example
stream.close()
-
def tar_file_expander(data, handler=reraise_exception):
"""Expand a stream of open tar files into a stream of tar file contents.
@@ -201,8 +200,9 @@ def tar_file_expander(data, handler=reraise_exception):
assert isinstance(source, dict)
assert "stream" in source
for sample in tar_file_iterator(source["stream"]):
- assert (isinstance(sample, dict) and "data" in sample and
- "fname" in sample)
+ assert (
+ isinstance(sample, dict) and "data" in sample and "fname" in sample
+ )
sample["__url__"] = url
yield sample
except Exception as exn:
@@ -213,6 +213,8 @@ def tar_file_expander(data, handler=reraise_exception):
break
+
+
def tar_file_and_group_expander(data, handler=reraise_exception):
"""Expand a stream of open tar files into a stream of tar file contents.
@@ -224,8 +226,9 @@ def tar_file_and_group_expander(data, handler=reraise_exception):
assert isinstance(source, dict)
assert "stream" in source
for sample in tar_file_and_group_iterator(source["stream"]):
- assert (isinstance(sample, dict) and "wav" in sample and
- "txt" in sample and "fname" in sample)
+ assert (
+ isinstance(sample, dict) and "wav" in sample and "txt" in sample and "fname" in sample
+ )
sample["__url__"] = url
yield sample
except Exception as exn:
@@ -236,11 +239,7 @@ def tar_file_and_group_expander(data, handler=reraise_exception):
break
-def group_by_keys(data,
- keys=base_plus_ext,
- lcase=True,
- suffixes=None,
- handler=None):
+def group_by_keys(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
"""Return function over iterator that groups key, value pairs into samples.
:param keys: function that splits the key into key and extension (base_plus_ext)
@@ -255,8 +254,8 @@ def group_by_keys(data,
print(
prefix,
suffix,
- current_sample.keys()
- if isinstance(current_sample, dict) else None, )
+ current_sample.keys() if isinstance(current_sample, dict) else None,
+ )
if prefix is None:
continue
if lcase:
diff --git a/paddlespeech/audio/streamdata/utils.py b/paddlespeech/audio/streamdata/utils.py
index 94dab9052..c7294f2bf 100644
--- a/paddlespeech/audio/streamdata/utils.py
+++ b/paddlespeech/audio/streamdata/utils.py
@@ -4,23 +4,22 @@
# This file is part of the WebDataset library.
# See the LICENSE file for licensing terms (BSD-style).
#
+
# Modified from https://github.com/webdataset/webdataset
+
"""Miscellaneous utility functions."""
+
import importlib
import itertools as itt
import os
import re
import sys
-from typing import Any
-from typing import Callable
-from typing import Iterator
-from typing import Union
+from typing import Any, Callable, Iterator, Optional, Union
from ..utils.log import Logger
logger = Logger(__name__)
-
def make_seed(*args):
seed = 0
for arg in args:
@@ -38,7 +37,7 @@ def identity(x: Any) -> Any:
return x
-def safe_eval(s: str, expr: str="{}"):
+def safe_eval(s: str, expr: str = "{}"):
"""Evaluate the given expression more safely."""
if re.sub("[^A-Za-z0-9_]", "", s) != s:
raise ValueError(f"safe_eval: illegal characters in: '{s}'")
@@ -55,9 +54,9 @@ def lookup_sym(sym: str, modules: list):
return None
-def repeatedly0(loader: Iterator,
- nepochs: int=sys.maxsize,
- nbatches: int=sys.maxsize):
+def repeatedly0(
+ loader: Iterator, nepochs: int = sys.maxsize, nbatches: int = sys.maxsize
+):
"""Repeatedly returns batches from a DataLoader."""
for epoch in range(nepochs):
for sample in itt.islice(loader, nbatches):
@@ -70,11 +69,12 @@ def guess_batchsize(batch: Union[tuple, list]):
def repeatedly(
- source: Iterator,
- nepochs: int=None,
- nbatches: int=None,
- nsamples: int=None,
- batchsize: Callable[..., int]=guess_batchsize, ):
+ source: Iterator,
+ nepochs: int = None,
+ nbatches: int = None,
+ nsamples: int = None,
+ batchsize: Callable[..., int] = guess_batchsize,
+):
"""Repeatedly yield samples from an iterator."""
epoch = 0
batch = 0
@@ -93,7 +93,6 @@ def repeatedly(
if nepochs is not None and epoch >= nepochs:
return
-
def paddle_worker_info(group=None):
"""Return node and worker info for PyTorch and some distributed environments."""
rank = 0
@@ -117,7 +116,7 @@ def paddle_worker_info(group=None):
else:
try:
from paddle.io import get_worker_info
- worker_info = get_worker_info()
+ worker_info = paddle.io.get_worker_info()
if worker_info is not None:
worker = worker_info.id
num_workers = worker_info.num_workers
@@ -127,7 +126,6 @@ def paddle_worker_info(group=None):
return rank, world_size, worker, num_workers
-
def paddle_worker_seed(group=None):
"""Compute a distinct, deterministic RNG seed for each worker and node."""
rank, world_size, worker, num_workers = paddle_worker_info(group=group)
diff --git a/paddlespeech/audio/streamdata/writer.py b/paddlespeech/audio/streamdata/writer.py
index 3928a3ba6..7d4f7703b 100644
--- a/paddlespeech/audio/streamdata/writer.py
+++ b/paddlespeech/audio/streamdata/writer.py
@@ -5,24 +5,18 @@
# See the LICENSE file for licensing terms (BSD-style).
# Modified from https://github.com/webdataset/webdataset
#
+
"""Classes and functions for writing tar files and WebDataset files."""
-import io
-import json
-import pickle
-import re
-import tarfile
-import time
-from typing import Any
-from typing import Callable
-from typing import Optional
-from typing import Union
+
+import io, json, pickle, re, tarfile, time
+from typing import Any, Callable, Optional, Union
import numpy as np
from . import gopen
-def imageencoder(image: Any, format: str="PNG"): # skipcq: PYL-W0622
+def imageencoder(image: Any, format: str = "PNG"): # skipcq: PYL-W0622
"""Compress an image using PIL and return it as a string.
Can handle float or uint8 images.
@@ -73,7 +67,6 @@ def bytestr(data: Any):
return data.encode("ascii")
return str(data).encode("ascii")
-
def paddle_dumps(data: Any):
"""Dump data into a bytestring using paddle.dumps.
@@ -89,7 +82,6 @@ def paddle_dumps(data: Any):
paddle.save(data, stream)
return stream.getvalue()
-
def numpy_dumps(data: np.ndarray):
"""Dump data into a bytestring using numpy npy format.
@@ -147,8 +139,9 @@ def add_handlers(d, keys, value):
def make_handlers():
"""Create a list of handlers for encoding data."""
handlers = {}
- add_handlers(handlers, "cls cls2 class count index inx id",
- lambda x: str(x).encode("ascii"))
+ add_handlers(
+ handlers, "cls cls2 class count index inx id", lambda x: str(x).encode("ascii")
+ )
add_handlers(handlers, "txt text transcript", lambda x: x.encode("utf-8"))
add_handlers(handlers, "html htm", lambda x: x.encode("utf-8"))
add_handlers(handlers, "pyd pickle", pickle.dumps)
@@ -159,8 +152,7 @@ def make_handlers():
add_handlers(handlers, "json jsn", lambda x: json.dumps(x).encode("utf-8"))
add_handlers(handlers, "mp msgpack msg", mp_dumps)
add_handlers(handlers, "cbor", cbor_dumps)
- add_handlers(handlers, "jpg jpeg img image",
- lambda data: imageencoder(data, "jpg"))
+ add_handlers(handlers, "jpg jpeg img image", lambda data: imageencoder(data, "jpg"))
add_handlers(handlers, "png", lambda data: imageencoder(data, "png"))
add_handlers(handlers, "pbm", lambda data: imageencoder(data, "pbm"))
add_handlers(handlers, "pgm", lambda data: imageencoder(data, "pgm"))
@@ -200,8 +192,7 @@ def encode_based_on_extension(sample: dict, handlers: dict):
:param handlers: handlers for encoding
"""
return {
- k: encode_based_on_extension1(v, k, handlers)
- for k, v in list(sample.items())
+ k: encode_based_on_extension1(v, k, handlers) for k, v in list(sample.items())
}
@@ -267,14 +258,15 @@ class TarWriter:
"""
def __init__(
- self,
- fileobj,
- user: str="bigdata",
- group: str="bigdata",
- mode: int=0o0444,
- compress: Optional[bool]=None,
- encoder: Union[None, bool, Callable]=True,
- keep_meta: bool=False, ):
+ self,
+ fileobj,
+ user: str = "bigdata",
+ group: str = "bigdata",
+ mode: int = 0o0444,
+ compress: Optional[bool] = None,
+ encoder: Union[None, bool, Callable] = True,
+ keep_meta: bool = False,
+ ):
"""Create a tar writer.
:param fileobj: stream to write data to
@@ -338,7 +330,8 @@ class TarWriter:
continue
if not isinstance(v, (bytes, bytearray, memoryview)):
raise ValueError(
- f"{k} doesn't map to a bytes after encoding ({type(v)})")
+ f"{k} doesn't map to a bytes after encoding ({type(v)})"
+ )
key = obj["__key__"]
for k in sorted(obj.keys()):
if k == "__key__":
@@ -356,8 +349,7 @@ class TarWriter:
ti.uname = self.user
ti.gname = self.group
if not isinstance(v, (bytes, bytearray, memoryview)):
- raise ValueError(
- f"converter didn't yield bytes: {k}, {type(v)}")
+ raise ValueError(f"converter didn't yield bytes: {k}, {type(v)}")
stream = io.BytesIO(v)
self.tarstream.addfile(ti, stream)
total += ti.size
@@ -368,13 +360,14 @@ class ShardWriter:
"""Like TarWriter but splits into multiple shards."""
def __init__(
- self,
- pattern: str,
- maxcount: int=100000,
- maxsize: float=3e9,
- post: Optional[Callable]=None,
- start_shard: int=0,
- **kw, ):
+ self,
+ pattern: str,
+ maxcount: int = 100000,
+ maxsize: float = 3e9,
+ post: Optional[Callable] = None,
+ start_shard: int = 0,
+ **kw,
+ ):
"""Create a ShardWriter.
:param pattern: output file pattern
@@ -407,7 +400,8 @@ class ShardWriter:
self.fname,
self.count,
"%.1f GB" % (self.size / 1e9),
- self.total, )
+ self.total,
+ )
self.shard += 1
stream = open(self.fname, "wb")
self.tarstream = TarWriter(stream, **self.kw)
@@ -419,8 +413,11 @@ class ShardWriter:
:param obj: sample to be written
"""
- if (self.tarstream is None or self.count >= self.maxcount or
- self.size >= self.maxsize):
+ if (
+ self.tarstream is None
+ or self.count >= self.maxcount
+ or self.size >= self.maxsize
+ ):
self.next_stream()
size = self.tarstream.write(obj)
self.count += 1
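TarWriter and ShardWriter write WebDataset-style tar shards from dict samples keyed by extension plus a "__key__". A minimal sketch, not part of this patch; the output pattern and payloads are placeholders:

import os
from paddlespeech.audio.streamdata.writer import ShardWriter

os.makedirs("out", exist_ok=True)
with ShardWriter("out/shard-%06d.tar", maxcount=1000) as sink:   # opens a new shard every 1000 samples
    for i in range(3):
        sink.write({
            "__key__": f"utt{i:05d}",     # per-sample key, becomes the file prefix inside the tar
            "wav": b"\x00" * 32,           # raw bytes are stored as-is
            "txt": f"transcript {i}",      # strings go through the txt/utf-8 handler
        })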
diff --git a/paddlespeech/audio/text/text_featurizer.py b/paddlespeech/audio/text/text_featurizer.py
index bcd6df54b..91c4d75c3 100644
--- a/paddlespeech/audio/text/text_featurizer.py
+++ b/paddlespeech/audio/text/text_featurizer.py
@@ -17,7 +17,6 @@ from typing import Union
import sentencepiece as spm
-from ..utils.log import Logger
from .utility import BLANK
from .utility import EOS
from .utility import load_dict
@@ -25,6 +24,7 @@ from .utility import MASKCTC
from .utility import SOS
from .utility import SPACE
from .utility import UNK
+from ..utils.log import Logger
logger = Logger(__name__)
diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py
index 0825caec8..8044dc36f 100644
--- a/paddlespeech/audio/transform/perturb.py
+++ b/paddlespeech/audio/transform/perturb.py
@@ -12,16 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
-import io
-import os
-
-import h5py
import librosa
import numpy
-import numpy as np
import scipy
import soundfile
+import io
+import os
+import h5py
+import numpy as np
class SoundHDF5File():
"""Collecting sound files to a HDF5 file
@@ -110,7 +109,6 @@ class SoundHDF5File():
def close(self):
self.file.close()
-
class SpeedPerturbation():
"""SpeedPerturbation
@@ -560,3 +558,4 @@ class RIRConvolve():
[scipy.convolve(x, r, mode="same") for r in rir], axis=-1)
else:
return scipy.convolve(x, rir, mode="same")
+
diff --git a/paddlespeech/audio/transform/spec_augment.py b/paddlespeech/audio/transform/spec_augment.py
index b2635066f..029e7b8f5 100644
--- a/paddlespeech/audio/transform/spec_augment.py
+++ b/paddlespeech/audio/transform/spec_augment.py
@@ -14,7 +14,6 @@
# Modified from espnet(https://github.com/espnet/espnet)
"""Spec Augment module for preprocessing i.e., data augmentation"""
import random
-
import numpy
from PIL import Image
diff --git a/paddlespeech/audio/transform/spectrogram.py b/paddlespeech/audio/transform/spectrogram.py
index 864f3f994..99d50d81e 100644
--- a/paddlespeech/audio/transform/spectrogram.py
+++ b/paddlespeech/audio/transform/spectrogram.py
@@ -381,6 +381,36 @@ class LogMelSpectrogramKaldi():
mat = np.squeeze(mat.numpy())
return mat
+class WavProcess():
+ def __init__(
+ self,
+ dither=0.1):
+ """
+ Args:
+ dither (float): Dithering constant
+
+ Returns:
+ """
+
+ self.dither = dither
+
+ def __call__(self, x, train):
+ """
+ Args:
+ x (np.ndarray): shape (Ti,)
+ train (bool): True, train mode.
+
+ Raises:
+            ValueError: if x has shape (Ti, C) instead of (Ti,)
+
+        Returns:
+            np.ndarray: waveform of shape (T, 1)
+ """
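+        # NOTE: dither is selected here but not applied below; the waveform is only
+        # reshaped to (T, 1)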
+ dither = self.dither if train else 0.0
+ if x.ndim != 1:
+            raise ValueError("x with shape [Time, Channel] is not supported")
+ waveform = np.expand_dims(x, -1)
+ return waveform
class LogMelSpectrogramKaldi_decay():
def __init__(
diff --git a/paddlespeech/audio/transform/transformation.py b/paddlespeech/audio/transform/transformation.py
index d24d6437c..e2f66dbf2 100644
--- a/paddlespeech/audio/transform/transformation.py
+++ b/paddlespeech/audio/transform/transformation.py
@@ -41,6 +41,7 @@ import_alias = dict(
utterance_cmvn="paddlespeech.audio.transform.cmvn:UtteranceCMVN",
fbank="paddlespeech.audio.transform.spectrogram:LogMelSpectrogram",
spectrogram="paddlespeech.audio.transform.spectrogram:Spectrogram",
+ wav_process="paddlespeech.audio.transform.spectrogram:WavProcess",
stft="paddlespeech.audio.transform.spectrogram:Stft",
istft="paddlespeech.audio.transform.spectrogram:IStft",
stft2fbank="paddlespeech.audio.transform.spectrogram:Stft2LogMelSpectrogram",
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 7296776f9..f9b4439ec 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -99,9 +99,8 @@ class ASRExecutor(BaseExecutor):
'-y',
action="store_true",
default=False,
- help='No additional parameters required. \
- Once set this parameter, it means accepting the request of the program by default, \
- which includes transforming the audio sample rate')
+        help='No additional parameters required. Once this flag is set, the program requests are accepted by default, which includes transforming the audio sample rate.'
+ )
self.parser.add_argument(
'--rtf',
action="store_true",
@@ -341,7 +340,7 @@ class ASRExecutor(BaseExecutor):
audio = np.round(audio).astype("int16")
return audio
- def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False):
+ def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
self.sample_rate = sample_rate
if self.sample_rate != 16000 and self.sample_rate != 8000:
logger.error(
@@ -435,17 +434,8 @@ class ASRExecutor(BaseExecutor):
for id_, input_ in task_source.items():
try:
- res = self(
- audio_file=input_,
- model=model,
- lang=lang,
- sample_rate=sample_rate,
- config=config,
- ckpt_path=ckpt_path,
- decode_method=decode_method,
- force_yes=force_yes,
- rtf=rtf,
- device=device)
+ res = self(input_, model, lang, sample_rate, config, ckpt_path,
+ decode_method, force_yes, rtf, device)
task_results[id_] = res
except Exception as e:
has_exceptions = True
diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py
index b53eed88c..3800c36db 100644
--- a/paddlespeech/cli/executor.py
+++ b/paddlespeech/cli/executor.py
@@ -191,7 +191,7 @@ class BaseExecutor(ABC):
line = line.strip()
if not line:
continue
- k, v = line.split() # space or \t
+ k, v = line.split() # space or \t
job_contents[k] = v
return job_contents
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index 111987246..48ca1f98d 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -70,14 +70,6 @@ class VectorExecutor(BaseExecutor):
type=str,
default=None,
help="Checkpoint file of model.")
- self.parser.add_argument(
- '--yes',
- '-y',
- action="store_true",
- default=False,
- help='No additional parameters required. \
- Once set this parameter, it means accepting the request of the program by default, \
- which includes transforming the audio sample rate')
self.parser.add_argument(
'--config',
type=str,
@@ -117,7 +109,6 @@ class VectorExecutor(BaseExecutor):
sample_rate = parser_args.sample_rate
config = parser_args.config
ckpt_path = parser_args.ckpt_path
- force_yes = parser_args.yes
device = parser_args.device
# stage 1: configurate the verbose flag
@@ -137,14 +128,8 @@ class VectorExecutor(BaseExecutor):
# extract the speaker audio embedding
if parser_args.task == "spk":
logger.debug("do vector spk task")
- res = self(
- audio_file=input_,
- model=model,
- sample_rate=sample_rate,
- config=config,
- ckpt_path=ckpt_path,
- force_yes=force_yes,
- device=device)
+ res = self(input_, model, sample_rate, config, ckpt_path,
+ device)
task_result[id_] = res
elif parser_args.task == "score":
logger.debug("do vector score task")
@@ -160,22 +145,10 @@ class VectorExecutor(BaseExecutor):
logger.debug(
f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}"
)
- enroll_embedding = self(
- audio_file=enroll_audio,
- model=model,
- sample_rate=sample_rate,
- config=config,
- ckpt_path=ckpt_path,
- force_yes=force_yes,
- device=device)
- test_embedding = self(
- audio_file=test_audio,
- model=model,
- sample_rate=sample_rate,
- config=config,
- ckpt_path=ckpt_path,
- force_yes=force_yes,
- device=device)
+ enroll_embedding = self(enroll_audio, model, sample_rate,
+ config, ckpt_path, device)
+ test_embedding = self(test_audio, model, sample_rate,
+ config, ckpt_path, device)
# get the score
res = self.get_embeddings_score(enroll_embedding,
@@ -249,7 +222,6 @@ class VectorExecutor(BaseExecutor):
sample_rate: int=16000,
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
- force_yes: bool=False,
device=paddle.get_device()):
"""Extract the audio embedding
@@ -268,7 +240,7 @@ class VectorExecutor(BaseExecutor):
"""
# stage 0: check the audio format
audio_file = os.path.abspath(audio_file)
- if not self._check(audio_file, sample_rate, force_yes):
+ if not self._check(audio_file, sample_rate):
sys.exit(-1)
# stage 1: set the paddle runtime host device
@@ -446,7 +418,7 @@ class VectorExecutor(BaseExecutor):
logger.debug("audio extract the feat success")
- def _check(self, audio_file: str, sample_rate: int, force_yes: bool=False):
+ def _check(self, audio_file: str, sample_rate: int):
"""Check if the model sample match the audio sample rate
Args:
@@ -490,34 +462,13 @@ class VectorExecutor(BaseExecutor):
logger.debug(f"The sample rate is {audio_sample_rate}")
if audio_sample_rate != self.sample_rate:
- logger.debug("The sample rate of the input file is not {}.\n \
+ logger.error("The sample rate of the input file is not {}.\n \
The program will resample the wav file to {}.\n \
If the result does not meet your expectations,\n \
Please input the 16k 16 bit 1 channel wav file. \
".format(self.sample_rate, self.sample_rate))
- if force_yes is False:
- while (True):
- logger.debug(
- "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream."
- )
- content = input("Input(Y/N):")
- if content.strip() == "Y" or content.strip(
- ) == "y" or content.strip() == "yes" or content.strip(
- ) == "Yes":
- logger.debug(
- "change the sampele rate, channel to 16k and 1 channel"
- )
- break
- elif content.strip() == "N" or content.strip(
- ) == "n" or content.strip() == "no" or content.strip(
- ) == "No":
- logger.debug("Exit the program")
- return False
- else:
- logger.warning("Not regular input, please input again")
- self.change_format = True
+ sys.exit(-1)
else:
logger.debug("The audio file format is right")
- self.change_format = False
return True
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index f049879a3..872d564cd 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -1363,11 +1363,5 @@ g2pw_onnx_models = {
'md5':
'7e049a55547da840502cf99e8a64f20e',
},
- '1.1': {
- 'url':
- 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/g2p/G2PWModel_1.1.zip',
- 'md5':
- 'f8b60501770bff92ed6ce90860a610e6',
- },
},
}
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 5fe2e16b9..f6476b9aa 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -114,7 +114,6 @@ if not hasattr(paddle.Tensor, 'new_full'):
paddle.Tensor.new_full = new_full
paddle.static.Variable.new_full = new_full
-
def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
return xs
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
index 66ea29d08..90b7d8a18 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
@@ -20,8 +20,8 @@ import paddle
import soundfile
from yacs.config import CfgNode
-from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
@@ -38,24 +38,24 @@ class DeepSpeech2Tester_hub():
self.args = args
self.config = config
self.audio_file = args.audio_file
-
- self.preprocess_conf = config.preprocess_config
- self.preprocess_args = {"train": False}
- self.preprocessing = Transformation(self.preprocess_conf)
-
- self.text_feature = TextFeaturizer(
- unit_type=config.unit_type,
- vocab=config.vocab_filepath,
- spm_model_prefix=config.spm_model_prefix)
- paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
+ self.collate_fn_test = SpeechCollator.from_config(config)
+ self._text_featurizer = TextFeaturizer(
+ unit_type=config.unit_type, vocab=None)
def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
- decode_batch_size = cfg.decode_batch_size
- self.model.decoder.init_decoder(
- decode_batch_size, vocab_list, cfg.decoding_method,
- cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size,
- cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch)
- result_transcripts = self.model.decode(audio, audio_len)
+ result_transcripts = self.model.decode(
+ audio,
+ audio_len,
+ vocab_list,
+ decoding_method=cfg.decoding_method,
+ lang_model_path=cfg.lang_model_path,
+ beam_alpha=cfg.alpha,
+ beam_beta=cfg.beta,
+ beam_size=cfg.beam_size,
+ cutoff_prob=cfg.cutoff_prob,
+ cutoff_top_n=cfg.cutoff_top_n,
+ num_processes=cfg.num_proc_bsearch)
+
return result_transcripts
@mp_tools.rank_zero_only
@@ -64,23 +64,16 @@ class DeepSpeech2Tester_hub():
self.model.eval()
cfg = self.config
audio_file = self.audio_file
-
- audio, sample_rate = soundfile.read(
- self.audio_file, dtype="int16", always_2d=True)
-
- audio = audio[:, 0]
- logger.info(f"audio shape: {audio.shape}")
-
- # fbank
- feat = self.preprocessing(audio, **self.preprocess_args)
- logger.info(f"feat shape: {feat.shape}")
-
- audio_len = paddle.to_tensor(feat.shape[0])
- audio = paddle.to_tensor(feat, dtype='float32').unsqueeze(axis=0)
-
+ collate_fn_test = self.collate_fn_test
+ audio, _ = collate_fn_test.process_utterance(
+ audio_file=audio_file, transcript=" ")
+ audio_len = audio.shape[0]
+ audio = paddle.to_tensor(audio, dtype='float32')
+ audio_len = paddle.to_tensor(audio_len)
+ audio = paddle.unsqueeze(audio, axis=0)
+ vocab_list = collate_fn_test.vocab_list
result_transcripts = self.compute_result_transcripts(
- audio, audio_len, self.text_feature.vocab_list, cfg.decode)
-
+ audio, audio_len, vocab_list, cfg.decode)
logger.info("result_transcripts: " + result_transcripts[0])
def run_test(self):
@@ -116,9 +109,11 @@ class DeepSpeech2Tester_hub():
def setup_model(self):
config = self.config.clone()
with UpdateConfig(config):
- config.input_dim = config.feat_dim
- config.output_dim = self.text_feature.vocab_size
+ config.input_dim = self.collate_fn_test.feature_size
+ config.output_dim = self.collate_fn_test.vocab_size
+
model = DeepSpeech2Model.from_config(config)
+
self.model = model
def setup_checkpointer(self):
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index db60083b0..67186081c 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -25,6 +25,8 @@ import paddle
from paddle import distributed as dist
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
+from paddlespeech.s2t.io.dataloader import StreamDataLoader
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.optimizer import OptimizerFactory
@@ -107,8 +109,7 @@ class U2Trainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
- logger.info(
- f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@@ -135,8 +136,7 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
- msg += "batch: {}/{}, ".format(i + 1,
- len(self.valid_loader))
+ msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@@ -157,8 +157,7 @@ class U2Trainer(Trainer):
self.before_train()
if not self.use_streamdata:
- logger.info(
- f"Train Total Examples: {len(self.train_loader.dataset)}")
+ logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@@ -226,18 +225,14 @@ class U2Trainer(Trainer):
config = self.config.clone()
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
- self.train_loader = DataLoaderFactory.get_dataloader(
- 'train', config, self.args)
- self.valid_loader = DataLoaderFactory.get_dataloader(
- 'valid', config, self.args)
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
decode_batch_size = config.get('decode', dict()).get(
'decode_batch_size', 1)
- self.test_loader = DataLoaderFactory.get_dataloader('test', config,
- self.args)
- self.align_loader = DataLoaderFactory.get_dataloader(
- 'align', config, self.args)
+ self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
+ self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
logger.info("Setup test/align Dataloader!")
def setup_model(self):
@@ -250,8 +245,7 @@ class U2Trainer(Trainer):
model_conf.output_dim = self.train_loader.vocab_size
else:
model_conf.input_dim = self.test_loader.feat_dim
- model_conf.output_dim = self.test_loader.vocab_size
-
+ model_conf.output_dim = 5538
model = U2Model.from_config(model_conf)
if self.parallel:
@@ -316,6 +310,11 @@ class U2Tester(U2Trainer):
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath,
spm_model_prefix=self.config.spm_model_prefix)
+
+ self.text_feature_test = TextFeaturizer(
+ unit_type=self.config.unit_type,
+ vocab='/home/zhangtianhao/workspace/PaddleSpeech/examples/aishell/asr1/data/lang_char/vocab.txt',
+ spm_model_prefix=self.config.spm_model_prefix)
self.vocab_list = self.text_feature.vocab_list
def id2token(self, texts, texts_len, text_feature):
@@ -340,7 +339,7 @@ class U2Tester(U2Trainer):
error_rate_func = error_rate.cer if decode_config.error_rate_type == 'cer' else error_rate.wer
start_time = time.time()
- target_transcripts = self.id2token(texts, texts_len, self.text_feature)
+ target_transcripts = self.id2token(texts, texts_len, self.text_feature_test)
result_transcripts, result_tokenids = self.model.decode(
audio,
audio_len,
diff --git a/paddlespeech/s2t/exps/u2_kaldi/model.py b/paddlespeech/s2t/exps/u2_kaldi/model.py
index 073d74293..cb015c116 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
@@ -105,8 +105,7 @@ class U2Trainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
- logger.info(
- f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@@ -134,8 +133,7 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
- msg += "batch: {}/{}, ".format(i + 1,
- len(self.valid_loader))
+ msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@@ -155,8 +153,7 @@ class U2Trainer(Trainer):
self.before_train()
if not self.use_streamdata:
- logger.info(
- f"Train Total Examples: {len(self.train_loader.dataset)}")
+ logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@@ -168,8 +165,8 @@ class U2Trainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
- msg += "batch : {}/{}, ".format(
- batch_index + 1, len(self.train_loader))
+ msg += "batch : {}/{}, ".format(batch_index + 1,
+ len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
@@ -207,24 +204,21 @@ class U2Trainer(Trainer):
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
config = self.config.clone()
- self.train_loader = DataLoaderFactory.get_dataloader(
- 'train', config, self.args)
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
config = self.config.clone()
config['preprocess_config'] = None
- self.valid_loader = DataLoaderFactory.get_dataloader(
- 'valid', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
config = self.config.clone()
config['preprocess_config'] = None
- self.test_loader = DataLoaderFactory.get_dataloader('test', config,
- self.args)
+ self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
config = self.config.clone()
config['preprocess_config'] = None
- self.align_loader = DataLoaderFactory.get_dataloader(
- 'align', config, self.args)
+ self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
logger.info("Setup test/align Dataloader!")
+
def setup_model(self):
config = self.config
diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py
index d57c49546..603825435 100644
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
@@ -121,8 +121,7 @@ class U2STTrainer(Trainer):
def valid(self):
self.model.eval()
if not self.use_streamdata:
- logger.info(
- f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
valid_losses = defaultdict(list)
num_seen_utts = 1
total_loss = 0.0
@@ -156,8 +155,7 @@ class U2STTrainer(Trainer):
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
if not self.use_streamdata:
- msg += "batch: {}/{}, ".format(i + 1,
- len(self.valid_loader))
+ msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in valid_dump.items())
logger.info(msg)
@@ -177,8 +175,7 @@ class U2STTrainer(Trainer):
self.before_train()
if not self.use_streamdata:
- logger.info(
- f"Train Total Examples: {len(self.train_loader.dataset)}")
+ logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
while self.epoch < self.config.n_epoch:
with Timer("Epoch-Train Time Cost: {}"):
self.model.train()
@@ -251,16 +248,14 @@ class U2STTrainer(Trainer):
config['load_transcript'] = load_transcript
self.use_streamdata = config.get("use_stream_data", False)
if self.train:
- self.train_loader = DataLoaderFactory.get_dataloader(
- 'train', config, self.args)
- self.valid_loader = DataLoaderFactory.get_dataloader(
- 'valid', config, self.args)
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
logger.info("Setup train/valid Dataloader!")
else:
- self.test_loader = DataLoaderFactory.get_dataloader('test', config,
- self.args)
+ self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
logger.info("Setup test Dataloader!")
+
def setup_model(self):
config = self.config
model_conf = config
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py
new file mode 100644
index 000000000..185a92b8d
--- /dev/null
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py
new file mode 100644
index 000000000..4d16d9fa9
--- /dev/null
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for wav2vec2 ASR model."""
+import cProfile
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.utility import print_arguments
+
+# TODO(hui zhang): dynamic load
+
+
+def main_sp(config, args):
+ exp = Tester(config, args)
+ with exp.eval():
+ exp.setup()
+ exp.run_test()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+    # save asr result to the file given by --result_file
+ parser.add_argument(
+ '--dict-path', type=str, default=None, help='dict path.')
+ parser.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ args = parser.parse_args()
+ print_arguments(args, globals())
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ print(config)
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats('test.profile')
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/train.py b/paddlespeech/s2t/exps/wav2vec2/bin/train.py
new file mode 100644
index 000000000..b977b2a15
--- /dev/null
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/train.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Trainer for wav2vec2 ASR model."""
+import cProfile
+import os
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.utility import print_arguments
+
+
+def main_sp(config, args):
+ exp = Trainer(config, args)
+ exp.setup()
+ exp.run()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+ args = parser.parse_args()
+ print_arguments(args, globals())
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ print(config)
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats(os.path.join(args.output, 'train.profile'))
diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py
new file mode 100644
index 000000000..587a279b3
--- /dev/null
+++ b/paddlespeech/s2t/exps/wav2vec2/model.py
@@ -0,0 +1,465 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains wav2vec2 model."""
+import json
+import os
+import time
+from collections import defaultdict
+from collections import OrderedDict
+from contextlib import nullcontext
+from paddlespeech.s2t.utils import mp_tools
+
+import jsonlines
+import numpy as np
+import paddle
+from paddle import distributed as dist
+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
+from paddlespeech.s2t.io.dataloader import StreamDataLoader
+from paddlespeech.s2t.io.dataloader import DataLoaderFactory
+from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
+from paddlespeech.s2t.utils import error_rate
+
+
+from paddlespeech.s2t.training.optimizer import OptimizerFactory
+from paddlespeech.s2t.training.reporter import ObsScope
+from paddlespeech.s2t.training.reporter import report
+from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
+from paddlespeech.s2t.training.timer import Timer
+from paddlespeech.s2t.training.trainer import Trainer
+from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.s2t.utils import layer_tools
+from paddlespeech.s2t.utils.log import Log
+
+from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.speech_augmentation import TimeDomainSpecAugment
+
+
+logger = Log(__name__).getlog()
+
+class Wav2Vec2ASRTrainer(Trainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+
+ def train_batch(self, batch_index, batch, msg):
+ train_conf = self.config
+ start = time.time()
+
+ # forward
+ utt, wav, wavs_lens, target, target_lens = batch
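+        # lengths are converted to fractions of the padded batch length; the model and
+        # the time-domain augmentation below consume these relative lengths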
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ target_lens_rate = target_lens / target.shape[1]
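+        # keep only the first channel: (B, T, C) -> (B, T)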
+ wav = wav[:,:,0]
+ if train_conf.augment:
+ wav = self.speech_augmentation(wav, wavs_lens_rate)
+ loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
+ # print(self.model.wav2vec2.feature_projection.projection.weight)
+ # print(self.model.wav2vec2.feature_extractor.conv_layers[0].conv.weight)
+
+ # loss div by `batch_size * accum_grad`
+ loss /= train_conf.accum_grad
+ losses_np = {'loss': float(loss) * train_conf.accum_grad}
+
+ # loss backward
+ if (batch_index + 1) % train_conf.accum_grad != 0:
+ # Disable gradient synchronizations across DDP processes.
+ # Within this context, gradients will be accumulated on module
+ # variables, which will later be synchronized.
+ # When using cpu w/o DDP, model does not have `no_sync`
+ context = self.model.no_sync if (hasattr(self.model, "no_sync") and
+ self.parallel) else nullcontext
+ else:
+ # Used for single gpu training and DDP gradient synchronization
+ # processes.
+ context = nullcontext
+ with context():
+ loss.backward()
+ layer_tools.print_grads(self.model, print_func=None)
+
+ # optimizer step old
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+ self.optimizer.step()
+ self.optimizer.clear_grad()
+ self.lr_scheduler.step()
+ self.iteration += 1
+ # optimizer step new
+ # if (batch_index + 1) % train_conf.accum_grad == 0:
+ # self.optimizer.step()
+ # self.optimizer.clear_grad()
+ # self.iteration += 1
+
+ iteration_time = time.time() - start
+
+ for k, v in losses_np.items():
+ report(k, v)
+ report("batch_size", self.config.batch_size)
+ report("accum", train_conf.accum_grad)
+ report("step_cost", iteration_time)
+
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+ if dist.get_rank() == 0 and self.visualizer:
+ losses_np_v = losses_np.copy()
+ losses_np_v.update({"lr": self.lr_scheduler()})
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(
+ tag='train/' + key, value=val, step=self.iteration - 1)
+
+ @paddle.no_grad()
+ def valid(self):
+ self.model.eval()
+ if not self.use_streamdata:
+ logger.info(f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ valid_losses = defaultdict(list)
+ num_seen_utts = 1
+ total_loss = 0.0
+ for i, batch in enumerate(self.valid_loader):
+ utt, wav, wavs_lens, target, target_lens = batch
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ target_lens_rate = target_lens / target.shape[1]
+ wav = wav[:,:,0]
+ loss = self.model(wav, wavs_lens_rate, target, target_lens_rate)
+
+ if paddle.isfinite(loss):
+ num_utts = batch[1].shape[0]
+ num_seen_utts += num_utts
+ total_loss += float(loss) * num_utts
+ valid_losses['val_loss'].append(float(loss))
+
+ if (i + 1) % self.config.log_interval == 0:
+ valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
+ valid_dump['val_history_loss'] = total_loss / num_seen_utts
+
+ # logging
+ msg = f"Valid: Rank: {dist.get_rank()}, "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ if not self.use_streamdata:
+ msg += "batch: {}/{}, ".format(i + 1, len(self.valid_loader))
+ msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in valid_dump.items())
+ logger.info(msg)
+
+ logger.info('Rank {} Val info val_loss {}'.format(
+ dist.get_rank(), total_loss / num_seen_utts))
+ return total_loss, num_seen_utts
+
+ def do_train(self):
+ """The training process control by step."""
+ # !!!IMPORTANT!!!
+ # Try to export the model by script, if fails, we should refine
+ # the code to satisfy the script export requirements
+ # script_model = paddle.jit.to_static(self.model)
+ # script_model_path = str(self.checkpoint_dir / 'init')
+ # paddle.jit.save(script_model, script_model_path)
+
+ self.before_train()
+
+ if not self.use_streamdata:
+ logger.info(f"Train Total Examples: {len(self.train_loader.dataset)}")
+ while self.epoch < self.config.n_epoch:
+ with Timer("Epoch-Train Time Cost: {}"):
+ self.model.train()
+ try:
+ data_start_time = time.time()
+ for batch_index, batch in enumerate(self.train_loader):
+ dataload_time = time.time() - data_start_time
+ msg = "Train:"
+ observation = OrderedDict()
+ with ObsScope(observation):
+ report("Rank", dist.get_rank())
+ report("epoch", self.epoch)
+ report('step', self.iteration)
+ report("lr", self.lr_scheduler())
+ self.train_batch(batch_index, batch, msg)
+ self.after_train_batch()
+ report('iter', batch_index + 1)
+ if not self.use_streamdata:
+ report('total', len(self.train_loader))
+ report('reader_cost', dataload_time)
+ observation['batch_cost'] = observation[
+ 'reader_cost'] + observation['step_cost']
+ observation['samples'] = observation['batch_size']
+ observation['ips,samples/s'] = observation[
+ 'batch_size'] / observation['batch_cost']
+ for k, v in observation.items():
+ msg += f" {k.split(',')[0]}: "
+ msg += f"{v:>.8f}" if isinstance(v,
+ float) else f"{v}"
+ msg += f" {k.split(',')[1]}" if len(
+ k.split(',')) == 2 else ""
+ msg += ","
+ msg = msg[:-1] # remove the last ","
+ if (batch_index + 1) % self.config.log_interval == 0:
+ logger.info(msg)
+ data_start_time = time.time()
+ except Exception as e:
+ logger.error(e)
+ raise e
+ with Timer("Eval Time Cost: {}"):
+ total_loss, num_seen_utts = self.valid()
+ if dist.get_world_size() > 1:
+ num_seen_utts = paddle.to_tensor(num_seen_utts)
+ # the default operator in all_reduce function is sum.
+ dist.all_reduce(num_seen_utts)
+ total_loss = paddle.to_tensor(total_loss)
+ dist.all_reduce(total_loss)
+ cv_loss = total_loss / num_seen_utts
+ cv_loss = float(cv_loss)
+ else:
+ cv_loss = total_loss / num_seen_utts
+
+ logger.info(
+ 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
+ if self.visualizer:
+ self.visualizer.add_scalar(
+ tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(
+ tag='eval/lr', value=self.lr_scheduler(), step=self.epoch)
+
+ self.save(tag=self.epoch, infos={'val_loss': cv_loss})
+ self.new_epoch()
+
+ def setup_dataloader(self):
+ config = self.config.clone()
+ self.use_streamdata = config.get("use_stream_data", False)
+ if self.train:
+ self.train_loader = DataLoaderFactory.get_dataloader('train', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader('valid', config, self.args)
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ decode_batch_size = config.get('decode', dict()).get(
+ 'decode_batch_size', 1)
+ self.test_loader = DataLoaderFactory.get_dataloader('test', config, self.args)
+ self.align_loader = DataLoaderFactory.get_dataloader('align', config, self.args)
+ logger.info("Setup test/align Dataloader!")
+
+ def setup_model(self):
+ config = self.config
+ model_conf = config
+
+ with UpdateConfig(model_conf):
+ if self.train:
+ model_conf.input_dim = self.train_loader.feat_dim
+ model_conf.output_dim = self.train_loader.vocab_size
+ else:
+ model_conf.input_dim = self.test_loader.feat_dim
+ model_conf.output_dim = self.test_loader.vocab_size
+
+ model = Wav2vec2ASR.from_config(model_conf)
+
+ if self.parallel:
+ model = paddle.DataParallel(model)
+
+ # logger.info(f"{model}")
+ layer_tools.print_params(model, logger.info)
+ self.model = model
+ logger.info("Setup model!")
+ if model_conf.augment:
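+            # time-domain augmentation at 16 kHz; speeds are percentages of the
+            # original rate (95%-105% tempo perturbation)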
+ self.speech_augmentation = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105])
+
+ if not self.train:
+ return
+
+ train_config = config
+ optim_type = train_config.model_optim
+ optim_conf = train_config.model_optim_conf
+ scheduler_type = train_config.scheduler
+ scheduler_conf = train_config.scheduler_conf
+
+ scheduler_args = {
+ "learning_rate": optim_conf.lr,
+ "verbose": False,
+ "warmup_steps": scheduler_conf.warmup_steps,
+ "gamma": scheduler_conf.lr_decay,
+ "d_model": model_conf.dnn_neurons,
+ }
+ lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
+ scheduler_args)
+
+ def optimizer_args(
+ config,
+ parameters,
+ lr_scheduler=None, ):
+ train_config = config
+ optim_type = train_config.model_optim
+ optim_conf = train_config.model_optim_conf
+ scheduler_type = train_config.scheduler
+ scheduler_conf = train_config.scheduler_conf
+ return {
+ "grad_clip": train_config.global_grad_clip,
+ "learning_rate": lr_scheduler
+ if lr_scheduler else optim_conf.lr,
+ "epsilon": optim_conf.epsilon,
+ "rho": optim_conf.rho,
+ "parameters": parameters,
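+                # NOTE: this second "epsilon" key overrides the optim_conf value above;
+                # the noam-specific defaults below become None for other optimizer types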
+ "epsilon": 1e-9 if optim_type == 'noam' else None,
+ "beta1": 0.9 if optim_type == 'noam' else None,
+                "beta2": 0.98 if optim_type == 'noam' else None,
+ }
+
+ # optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
+ optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
+
+ optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)
+
+ self.optimizer = optimizer
+ self.lr_scheduler = lr_scheduler
+ logger.info("Setup optimizer/lr_scheduler!")
+
+
+
+class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+ print(config)
+ self.text_featurizer = TextFeaturizer(
+ unit_type=config.unit_type, vocab=config.vocab_filepath)
+ self.vocab_list = self.text_featurizer.vocab_list
+
+ def id2token(self, texts, texts_len):
+        """Convert token ids back to text using the text featurizer."""
+ trans = []
+ for text, n in zip(texts, texts_len):
+ n = n.numpy().item()
+ ids = text[:n]
+ trans.append(
+ self.text_featurizer.defeaturize(ids.numpy().tolist()))
+ return trans
+
+ def compute_metrics(self,
+ utts,
+ audio,
+ audio_len,
+ texts,
+ texts_len,
+ fout=None):
+ decode_cfg = self.config.decode
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+ error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+
+ start_time = time.time()
+ target_transcripts = self.id2token(texts, texts_len)
+ result_transcripts, result_tokenids = self.model.decode(
+ audio,
+ audio_len,
+ text_feature=self.text_featurizer,
+ decoding_method=decode_cfg.decoding_method,
+ beam_size=decode_cfg.beam_size)
+ decode_time = time.time() - start_time
+
+ for utt, target, result, rec_tids in zip(
+ utts, target_transcripts, result_transcripts, result_tokenids):
+ errors, len_ref = errors_func(target, result)
+ errors_sum += errors
+ len_refs += len_ref
+ num_ins += 1
+ if fout:
+ fout.write({
+ "utt": utt,
+ "refs": [target],
+ "hyps": [result],
+ "hyps_tokenid": [rec_tids],
+ })
+ logger.info(f"Utt: {utt}")
+ logger.info(f"Ref: {target}")
+ logger.info(f"Hyp: {result}")
+ logger.info("One example error rate [%s] = %f" % (
+ decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+ return dict(
+ errors_sum=errors_sum,
+ len_refs=len_refs,
+ num_ins=num_ins, # num examples
+ error_rate=errors_sum / len_refs,
+ error_rate_type=decode_cfg.error_rate_type,
+ num_frames=audio_len.sum().numpy().item(),
+ decode_time=decode_time)
+
+ @mp_tools.rank_zero_only
+ @paddle.no_grad()
+ def test(self):
+ logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+ self.model.eval()
+
+ error_rate_type = None
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ num_frames = 0.0
+ num_time = 0.0
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.vocab_list
+ decode_batch_size = decode_cfg.decode_batch_size
+ # self.model.decoder.init_decoder(
+ # decode_batch_size, vocab_list, decode_cfg.decoding_method,
+ # decode_cfg.lang_model_path, decode_cfg.alpha, decode_cfg.beta,
+ # decode_cfg.beam_size, decode_cfg.cutoff_prob,
+ # decode_cfg.cutoff_top_n, decode_cfg.num_proc_bsearch)
+
+ with jsonlines.open(self.args.result_file, 'w') as fout:
+ for i, batch in enumerate(self.test_loader):
+ metrics = self.compute_metrics(*batch, fout=fout)
+ num_frames += metrics['num_frames']
+ num_time += metrics["decode_time"]
+ errors_sum += metrics['errors_sum']
+ len_refs += metrics['len_refs']
+ num_ins += metrics['num_ins']
+ error_rate_type = metrics['error_rate_type']
+ rtf = num_time / (num_frames)
+ logger.info(
+ "RTF: %f, Error rate [%s] (%d/?) = %f" %
+ (rtf, error_rate_type, num_ins, errors_sum / len_refs))
+
+ # logging
+ msg = "Test: "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ msg += "Final error rate [%s] (%d/%d) = %f" % (
+ error_rate_type, num_ins, num_ins, errors_sum / len_refs)
+ logger.info(msg)
+
+ err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
+ err_type_str = "{}".format(error_rate_type)
+ with open(err_meta_path, 'w') as f:
+ data = json.dumps({
+ "epoch":
+ self.epoch,
+ "step":
+ self.iteration,
+ "rtf":
+ rtf,
+ error_rate_type:
+ errors_sum / len_refs,
+ "dataset_hour": (num_frames) / 1000.0 / 3600.0,
+ "process_hour":
+ num_time / 1000.0 / 3600.0,
+ "num_examples":
+ num_ins,
+ "err_sum":
+ errors_sum,
+ "ref_len":
+ len_refs,
+ "decode_method":
+ self.config.decode.decoding_method,
+ })
+ f.write(data + '\n')
+
+ @paddle.no_grad()
+ def export(self):
+ infer_model = DeepSpeech2InferModel.from_pretrained(
+ self.test_loader, self.config, self.args.checkpoint_path)
+ infer_model.eval()
+ static_model = infer_model.export()
+ logger.info(f"Export code: {static_model.forward.code}")
+ paddle.jit.save(static_model, self.args.export_path)
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 4cc8274f9..735d29da2 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -22,16 +22,17 @@ import paddle
from paddle.io import BatchSampler
from paddle.io import DataLoader
from paddle.io import DistributedBatchSampler
-from yacs.config import CfgNode
-import paddlespeech.audio.streamdata as streamdata
-from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.batchfy import make_batchset
from paddlespeech.s2t.io.converter import CustomConverter
from paddlespeech.s2t.io.dataset import TransformDataset
from paddlespeech.s2t.io.reader import LoadInputsAndTargets
from paddlespeech.s2t.utils.log import Log
+import paddlespeech.audio.streamdata as streamdata
+from paddlespeech.audio.text.text_featurizer import TextFeaturizer
+from yacs.config import CfgNode
+
__all__ = ["BatchDataLoader", "StreamDataLoader"]
logger = Log(__name__).getlog()
@@ -60,7 +61,6 @@ def batch_collate(x):
"""
return x[0]
-
def read_preprocess_cfg(preprocess_conf_file):
augment_conf = dict()
preprocess_cfg = CfgNode(new_allowed=True)
@@ -82,8 +82,7 @@ def read_preprocess_cfg(preprocess_conf_file):
augment_conf['num_t_mask'] = process['n_mask']
augment_conf['t_inplace'] = process['inplace']
augment_conf['t_replace_with_zero'] = process['replace_with_zero']
- return augment_conf
-
+ return augment_conf
class StreamDataLoader():
def __init__(self,
@@ -96,12 +95,12 @@ class StreamDataLoader():
frame_length=25,
frame_shift=10,
dither=0.0,
- minlen_in: float=0.0,
+ minlen_in: float=0.0,
maxlen_in: float=float('inf'),
minlen_out: float=0.0,
maxlen_out: float=float('inf'),
resample_rate: int=16000,
- shuffle_size: int=10000,
+ shuffle_size: int=10000,
sort_size: int=1000,
n_iter_processes: int=1,
prefetch_factor: int=2,
@@ -117,11 +116,11 @@ class StreamDataLoader():
text_featurizer = TextFeaturizer(unit_type, vocab_filepath)
symbol_table = text_featurizer.vocab_dict
- self.feat_dim = num_mel_bins
- self.vocab_size = text_featurizer.vocab_size
-
+ self.feat_dim = num_mel_bins
+ self.vocab_size = text_featurizer.vocab_size
+
augment_conf = read_preprocess_cfg(preprocess_conf)
-
+
# The list of shard
shardlist = []
with open(manifest_file, "r") as f:
@@ -129,68 +128,58 @@ class StreamDataLoader():
shardlist.append(line.strip())
world_size = 1
try:
- world_size = paddle.distributed.get_world_size()
+ world_size = paddle.distributed.get_world_size()
except Exception as e:
logger.warninig(e)
- logger.warninig(
- "can not get world_size using paddle.distributed.get_world_size(), use world_size=1"
- )
- assert len(shardlist) >= world_size, \
- "the length of shard list should >= number of gpus/xpus/..."
+            logger.warning("cannot get world_size using paddle.distributed.get_world_size(), use world_size=1")
+        assert len(shardlist) >= world_size, "the length of shard list should >= number of gpus/xpus/..."
- update_n_iter_processes = int(
- max(min(len(shardlist) / world_size - 1, self.n_iter_processes), 0))
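+        # cap the number of dataloader workers at (shards per rank - 1), never below 0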
+ update_n_iter_processes = int(max(min(len(shardlist)/world_size - 1, self.n_iter_processes), 0))
logger.info(f"update_n_iter_processes {update_n_iter_processes}")
if update_n_iter_processes != self.n_iter_processes:
- self.n_iter_processes = update_n_iter_processes
+ self.n_iter_processes = update_n_iter_processes
logger.info(f"change nun_workers to {self.n_iter_processes}")
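+        # shards are split across nodes (when dist_sampler is set and in training mode)
+        # and then across dataloader workers before being expanded into samples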
if self.dist_sampler:
base_dataset = streamdata.DataPipeline(
- streamdata.SimpleShardList(shardlist), streamdata.split_by_node
- if train_mode else streamdata.placeholder(),
+ streamdata.SimpleShardList(shardlist),
+ streamdata.split_by_node if train_mode else streamdata.placeholder(),
streamdata.split_by_worker,
- streamdata.tarfile_to_samples(streamdata.reraise_exception))
+ streamdata.tarfile_to_samples(streamdata.reraise_exception)
+ )
else:
base_dataset = streamdata.DataPipeline(
streamdata.SimpleShardList(shardlist),
streamdata.split_by_worker,
- streamdata.tarfile_to_samples(streamdata.reraise_exception))
+ streamdata.tarfile_to_samples(streamdata.reraise_exception)
+ )
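+        # per-sample pipeline: tokenize -> length filter -> resample -> fbank ->
+        # (spec-aug when training) -> shuffle -> sort -> batch -> pad -> cmvn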
self.dataset = base_dataset.append_list(
streamdata.audio_tokenize(symbol_table),
- streamdata.audio_data_filter(
- frame_shift=frame_shift,
- max_length=maxlen_in,
- min_length=minlen_in,
- token_max_length=maxlen_out,
- token_min_length=minlen_out),
+ streamdata.audio_data_filter(frame_shift=frame_shift, max_length=maxlen_in, min_length=minlen_in, token_max_length=maxlen_out, token_min_length=minlen_out),
streamdata.audio_resample(resample_rate=resample_rate),
- streamdata.audio_compute_fbank(
- num_mel_bins=num_mel_bins,
- frame_length=frame_length,
- frame_shift=frame_shift,
- dither=dither),
- streamdata.audio_spec_aug(**augment_conf)
- if train_mode else streamdata.placeholder(
- ), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80)
+ streamdata.audio_compute_fbank(num_mel_bins=num_mel_bins, frame_length=frame_length, frame_shift=frame_shift, dither=dither),
+ streamdata.audio_spec_aug(**augment_conf) if train_mode else streamdata.placeholder(), # num_t_mask=2, num_f_mask=2, max_t=40, max_f=30, max_w=80)
streamdata.shuffle(shuffle_size),
streamdata.sort(sort_size=sort_size),
streamdata.batched(batch_size),
streamdata.audio_padding(),
- streamdata.audio_cmvn(cmvn_file))
+ streamdata.audio_cmvn(cmvn_file)
+ )
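+        # prefetch_factor is only passed on newer paddle versions that support it;
+        # older versions fall back to the default prefetching behaviour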
if paddle.__version__ >= '2.3.2':
self.loader = streamdata.WebLoader(
- self.dataset,
- num_workers=self.n_iter_processes,
- prefetch_factor=self.prefetch_factor,
- batch_size=None)
+ self.dataset,
+ num_workers=self.n_iter_processes,
+                prefetch_factor=self.prefetch_factor,
+ batch_size=None
+ )
else:
self.loader = streamdata.WebLoader(
- self.dataset,
- num_workers=self.n_iter_processes,
- batch_size=None)
+ self.dataset,
+ num_workers=self.n_iter_processes,
+ batch_size=None
+ )
def __iter__(self):
return self.loader.__iter__()
@@ -199,9 +188,7 @@ class StreamDataLoader():
return self.__iter__()
def __len__(self):
- logger.info(
- "Stream dataloader does not support calculate the length of the dataset"
- )
+ logger.info("Stream dataloader does not support calculate the length of the dataset")
return -1
@@ -360,7 +347,7 @@ class DataLoaderFactory():
config['train_mode'] = True
elif mode == 'valid':
config['manifest'] = config.dev_manifest
- config['train_mode'] = False
+ config['train_mode'] = False
elif model == 'test' or mode == 'align':
config['manifest'] = config.test_manifest
config['train_mode'] = False
@@ -371,31 +358,30 @@ class DataLoaderFactory():
config['maxlen_out'] = float('inf')
config['dist_sampler'] = False
else:
- raise KeyError(
- "not valid mode type!!, please input one of 'train, valid, test, align'"
- )
+ raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'")
return StreamDataLoader(
- manifest_file=config.manifest,
- train_mode=config.train_mode,
- unit_type=config.unit_type,
- preprocess_conf=config.preprocess_config,
- batch_size=config.batch_size,
- num_mel_bins=config.feat_dim,
- frame_length=config.window_ms,
- frame_shift=config.stride_ms,
- dither=config.dither,
- minlen_in=config.minlen_in,
- maxlen_in=config.maxlen_in,
- minlen_out=config.minlen_out,
- maxlen_out=config.maxlen_out,
- resample_rate=config.resample_rate,
- shuffle_size=config.shuffle_size,
- sort_size=config.sort_size,
- n_iter_processes=config.num_workers,
- prefetch_factor=config.prefetch_factor,
- dist_sampler=config.dist_sampler,
- cmvn_file=config.cmvn_file,
- vocab_filepath=config.vocab_filepath, )
+ manifest_file=config.manifest,
+ train_mode=config.train_mode,
+ unit_type=config.unit_type,
+ preprocess_conf=config.preprocess_config,
+ batch_size=config.batch_size,
+ num_mel_bins=config.feat_dim,
+ frame_length=config.window_ms,
+ frame_shift=config.stride_ms,
+ dither=config.dither,
+ minlen_in=config.minlen_in,
+ maxlen_in=config.maxlen_in,
+ minlen_out=config.minlen_out,
+ maxlen_out=config.maxlen_out,
+ resample_rate=config.resample_rate,
+ shuffle_size=config.shuffle_size,
+ sort_size=config.sort_size,
+ n_iter_processes=config.num_workers,
+ prefetch_factor=config.prefetch_factor,
+ dist_sampler=config.dist_sampler,
+ cmvn_file=config.cmvn_file,
+ vocab_filepath=config.vocab_filepath,
+ )
else:
if mode == 'train':
config['manifest'] = config.train_manifest
@@ -425,7 +411,7 @@ class DataLoaderFactory():
config['train_mode'] = False
config['sortagrad'] = False
config['batch_size'] = config.get('decode', dict()).get(
- 'decode_batch_size', 1)
+ 'decode_batch_size', 1)
config['maxlen_in'] = float('inf')
config['maxlen_out'] = float('inf')
config['minibatches'] = 0
@@ -441,10 +427,8 @@ class DataLoaderFactory():
config['dist_sampler'] = False
config['shortest_first'] = False
else:
- raise KeyError(
- "not valid mode type!!, please input one of 'train, valid, test, align'"
- )
-
+ raise KeyError("not valid mode type!!, please input one of 'train, valid, test, align'")
+
return BatchDataLoader(
json_file=config.manifest,
train_mode=config.train_mode,
@@ -466,3 +450,4 @@ class DataLoaderFactory():
num_encs=config.num_encs,
dist_sampler=config.dist_sampler,
shortest_first=config.shortest_first)
+
diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py
index 5e018befb..44e452bb0 100644
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
@@ -120,6 +120,7 @@ class LoadInputsAndTargets():
x = self._get_from_loader(
filepath=inp["feat"],
filetype=inp.get("filetype", "mat"))
+
x_feats_dict.setdefault(inp["name"], []).append(x)
if self.load_output:
@@ -236,6 +237,7 @@ class LoadInputsAndTargets():
:return:
:rtype: np.ndarray
"""
+
if filetype == "hdf5":
# e.g.
# {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL",
diff --git a/paddlespeech/s2t/models/ds2/deepspeech2.py b/paddlespeech/s2t/models/ds2/deepspeech2.py
index b7ee80a7d..4557af86f 100644
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@@ -271,7 +271,7 @@ class DeepSpeech2Model(nn.Layer):
enc_n_units=self.encoder.output_size,
blank_id=blank_id,
dropout_rate=0.0,
- reduction=True, # sum
+ reduction_type="sum", # sum
batch_average=True, # sum / batch_size
grad_norm_type=ctc_grad_norm_type)
diff --git a/paddlespeech/s2t/models/test.py b/paddlespeech/s2t/models/test.py
new file mode 100644
index 000000000..488c386e1
--- /dev/null
+++ b/paddlespeech/s2t/models/test.py
@@ -0,0 +1,20 @@
+import paddle
+import paddle.nn as nn
+
+class Model(nn.Layer):
+ def __init__(self):
+ super().__init__()
+ self.linear = nn.Linear(1024,1024)
+
+ def forward(self, x):
+ return self.linear(x)
+
+model = Model()
+x = paddle.uniform([100,1024], dtype='float32')
+out = model(x)
+loss = paddle.mean(out)
+loss.backward()
+
+clip = nn.ClipGradByGlobalNorm(clip_norm=1.0)
+optim = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=model.parameters(), grad_clip=clip)
+optim.step()
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 813e1e529..b6a4eb7fa 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -605,8 +605,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk.
@@ -864,7 +864,7 @@ class U2Model(U2DecodeModel):
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=dropout_rate,
- reduction=True, # sum
+ reduction_type="sum", # sum
batch_average=True, # sum / batch_size
grad_norm_type=grad_norm_type)
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index e8b61bc0d..81ae43184 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -18,6 +18,7 @@ Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recogni
"""
import time
from typing import Dict
+from typing import List
from typing import Optional
from typing import Tuple
@@ -25,8 +26,6 @@ import paddle
from paddle import jit
from paddle import nn
-from paddlespeech.audio.utils.tensor_utils import add_sos_eos
-from paddlespeech.audio.utils.tensor_utils import th_accuracy
from paddlespeech.s2t.frontend.utility import IGNORE_ID
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.modules.cmvn import GlobalCMVN
@@ -39,6 +38,8 @@ from paddlespeech.s2t.modules.mask import subsequent_mask
from paddlespeech.s2t.utils import checkpoint
from paddlespeech.s2t.utils import layer_tools
from paddlespeech.s2t.utils.log import Log
+from paddlespeech.audio.utils.tensor_utils import add_sos_eos
+from paddlespeech.audio.utils.tensor_utils import th_accuracy
from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ["U2STModel", "U2STInferModel"]
@@ -400,8 +401,8 @@ class U2STBaseModel(nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
- att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
- cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Export interface for c++ call, give input chunk xs, and return
output from time 0 to current chunk.
@@ -434,8 +435,8 @@ class U2STBaseModel(nn.Layer):
paddle.Tensor: new conformer cnn cache required for next chunk, with
same shape as the original cnn_cache.
"""
- return self.encoder.forward_chunk(xs, offset, required_cache_size,
- att_cache, cnn_cache)
+ return self.encoder.forward_chunk(
+ xs, offset, required_cache_size, att_cache, cnn_cache)
# @jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
@@ -611,7 +612,7 @@ class U2STModel(U2STBaseModel):
enc_n_units=encoder.output_size(),
blank_id=0,
dropout_rate=dropout_rate,
- reduction=True, # sum
+ reduction_type='sum', # sum
batch_average=True, # sum / batch_size
grad_norm_type=grad_norm_type)
diff --git a/paddlespeech/s2t/models/wav2vec2/__init__.py b/paddlespeech/s2t/models/wav2vec2/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/paddlespeech/s2t/models/wav2vec2/activations.py b/paddlespeech/s2t/models/wav2vec2/activations.py
new file mode 100644
index 000000000..0158e8cb0
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/activations.py
@@ -0,0 +1,175 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+from packaging import version
+from paddle import Tensor, nn
+
+
+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+
+
+class NewGELUActivation(nn.Layer):
+ """
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+ the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+
+ def forward(self, input: Tensor) -> Tensor:
+ return 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0))))
+
+
+class GELUActivation(nn.Layer):
+ """
+ Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+ information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
+ Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+
+    def __init__(self, use_gelu_python: bool = False):
+        super().__init__()
+        if use_gelu_python:
+            # pure-Python reference formula, kept for the "gelu_python" registry entry
+            self.act = self._gelu_python
+        else:
+            self.act = nn.functional.gelu
+
+    def _gelu_python(self, input: Tensor) -> Tensor:
+        return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
+
+    def forward(self, input: Tensor) -> Tensor:
+        return self.act(input)
+
+
+class FastGELUActivation(nn.Layer):
+ """
+ Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
+ """
+
+ def forward(self, input: Tensor) -> Tensor:
+ return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))
+
+
+class QuickGELUActivation(nn.Layer):
+ """
+ Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
+ """
+
+ def forward(self, input: Tensor) -> Tensor:
+ return input * paddle.sigmoid(1.702 * input)
+
+
+class ClippedGELUActivation(nn.Layer):
+ """
+ Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
+ it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
+ https://arxiv.org/abs/2004.09602.
+
+ Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
+ initially created.
+
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
+ """
+
+ def __init__(self, min: float, max: float):
+ if min > max:
+ raise ValueError(f"min should be < max (got min: {min}, max: {max})")
+
+ super().__init__()
+ self.min = min
+ self.max = max
+
+ def forward(self, x: Tensor) -> Tensor:
+ return paddle.clip(gelu(x), self.min, self.max)
+
+
+class SiLUActivation(nn.Layer):
+ """
+ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+ Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+ Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+ Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+ later.
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = nn.functional.silu
+
+ def _silu_python(self, input: Tensor) -> Tensor:
+ return input * paddle.sigmoid(input)
+
+ def forward(self, input: Tensor) -> Tensor:
+ return self.act(input)
+
+
+class MishActivation(nn.Layer):
+ """
+ See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
+ visit the official repository for the paper: https://github.com/digantamisra98/Mish
+ """
+
+ def __init__(self):
+ super().__init__()
+ self.act = nn.functional.mish
+
+ def _mish_python(self, input: Tensor) -> Tensor:
+ return input * paddle.tanh(nn.functional.softplus(input))
+
+ def forward(self, input: Tensor) -> Tensor:
+ return self.act(input)
+
+
+class LinearActivation(nn.Layer):
+ """
+ Applies the linear activation function, i.e. forwarding input directly to output.
+ """
+
+ def forward(self, input: Tensor) -> Tensor:
+ return input
+
+
+ACT2FN = {
+ "gelu": GELUActivation(),
+ "gelu_10": ClippedGELUActivation(-10, 10),
+ "gelu_fast": FastGELUActivation(),
+ "gelu_new": NewGELUActivation(),
+ "gelu_python": GELUActivation(use_gelu_python=True),
+ "linear": LinearActivation(),
+ "mish": MishActivation(),
+ "quick_gelu": QuickGELUActivation(),
+ "relu": nn.ReLU(),
+ "sigmoid": nn.Sigmoid(),
+ "silu": SiLUActivation(),
+ "swish": SiLUActivation(),
+ "tanh": nn.Tanh(),
+}
+
+
+def get_activation(activation_string):
+ if activation_string in ACT2FN:
+ return ACT2FN[activation_string]
+ else:
+ raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")
+
+
+# For backwards compatibility with: from activations import gelu_python
+gelu_python = get_activation("gelu_python")
+gelu_new = get_activation("gelu_new")
+gelu = get_activation("gelu")
+gelu_fast = get_activation("gelu_fast")
+quick_gelu = get_activation("quick_gelu")
+silu = get_activation("silu")
+mish = get_activation("mish")
+linear_act = get_activation("linear")
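
A minimal usage sketch for the activation registry defined above (shapes are arbitrary):

    import paddle
    from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN, get_activation

    act = get_activation("gelu_new")   # same instance as ACT2FN["gelu_new"]
    y = act(paddle.randn([2, 8]))      # element-wise GELU approximation
    # an unknown name, e.g. get_activation("bogus"), raises KeyError with the valid keys
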
diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py
new file mode 100644
index 000000000..a5b509b66
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/modeling_outputs.py
@@ -0,0 +1,1129 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+from collections import OrderedDict
+
+from dataclasses import fields
+import paddle
+
+
+class ModelOutput(OrderedDict):
+ """
+ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a
+ tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular
+ python dictionary.
+
+
+
+    You can't unpack a `ModelOutput` directly. Use the [`~utils.ModelOutput.to_tuple`] method to convert it to a tuple
+    first.
+
+
+ """
+
+ def __post_init__(self):
+ class_fields = fields(self)
+
+ # Safety and consistency checks
+ if not len(class_fields):
+ raise ValueError(f"{self.__class__.__name__} has no fields.")
+ if not all(field.default is None for field in class_fields[1:]):
+ raise ValueError(f"{self.__class__.__name__} should not have more than one required field.")
+
+ first_field = getattr(self, class_fields[0].name)
+ other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:])
+
+ if other_fields_are_none and not paddle.is_tensor(first_field):
+ if isinstance(first_field, dict):
+ iterator = first_field.items()
+ first_field_iterator = True
+ else:
+ try:
+ iterator = iter(first_field)
+ first_field_iterator = True
+ except TypeError:
+ first_field_iterator = False
+
+ # if we provided an iterator as first field and the iterator is a (key, value) iterator
+ # set the associated fields
+ if first_field_iterator:
+ for element in iterator:
+ if (
+ not isinstance(element, (list, tuple))
+ or not len(element) == 2
+ or not isinstance(element[0], str)
+ ):
+ break
+ setattr(self, element[0], element[1])
+ if element[1] is not None:
+ self[element[0]] = element[1]
+ elif first_field is not None:
+ self[class_fields[0].name] = first_field
+ else:
+ for field in class_fields:
+ v = getattr(self, field.name)
+ if v is not None:
+ self[field.name] = v
+
+ def __delitem__(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.")
+
+ def setdefault(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.")
+
+ def pop(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.")
+
+ def update(self, *args, **kwargs):
+ raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.")
+
+ def __getitem__(self, k):
+ if isinstance(k, str):
+ inner_dict = {k: v for (k, v) in self.items()}
+ return inner_dict[k]
+ else:
+ return self.to_tuple()[k]
+
+ def __setattr__(self, name, value):
+ if name in self.keys() and value is not None:
+ # Don't call self.__setitem__ to avoid recursion errors
+ super().__setitem__(name, value)
+ super().__setattr__(name, value)
+
+ def __setitem__(self, key, value):
+ # Will raise a KeyException if needed
+ super().__setitem__(key, value)
+ # Don't call self.__setattr__ to avoid recursion errors
+ super().__setattr__(key, value)
+
+ def to_tuple(self) -> Tuple:
+ """
+ Convert self to a tuple containing all the attributes/keys that are not `None`.
+ """
+ return tuple(self[k] for k in self.keys())
+
+
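+# Illustrative usage (example only): ModelOutput subclasses skip fields that are
+# None and can be read by attribute, by string key, or by position, e.g.:
+#
+#   out = BaseModelOutput(last_hidden_state=paddle.ones([2, 5, 8]))
+#   out.last_hidden_state        # attribute access
+#   out["last_hidden_state"]     # dict-style access
+#   out[0]                       # tuple-style access via to_tuple()
+#   out.to_tuple()               # -> (last_hidden_state,); None fields are dropped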
+@dataclass
+class BaseModelOutput(ModelOutput):
+ """
+ Base class for model's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithNoAttention(ModelOutput):
+ """
+ Base class for model's outputs, with potential hidden states.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ """
+
+    last_hidden_state: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPooling(ModelOutput):
+ """
+ Base class for model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
+ Last layer hidden-state of the first token of the sequence (classification token) after further processing
+ through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
+ the classification token after processing through a linear layer and a tanh activation function. The linear
+ layer weights are trained from the next sentence prediction (classification) objective during pretraining.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ pooler_output: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPoolingAndNoAttention(ModelOutput):
+ """
+ Base class for model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
+ Last layer hidden-state after a pooling operation on the spatial dimensions.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ pooler_output: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPast(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+ encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+ input) to speed up sequential decoding.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithCrossAttentions(ModelOutput):
+ """
+ Base class for model's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput):
+ """
+ Base class for model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`paddle.Tensor` of shape `(batch_size, hidden_size)`):
+ Last layer hidden-state of the first token of the sequence (classification token) after further processing
+ through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns
+ the classification token after processing through a linear layer and a tanh activation function. The linear
+ layer weights are trained from the next sentence prediction (classification) objective during pretraining.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+ encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+ input) to speed up sequential decoding.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ pooler_output: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class BaseModelOutputWithPastAndCrossAttentions(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+ `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+ encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+ `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+ input) to speed up sequential decoding.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class Seq2SeqModelOutput(ModelOutput):
+ """
+    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
+ decoding.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+ decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+ encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+ encoder_last_hidden_state: Optional[paddle.Tensor] = None
+ encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class CausalLMOutput(ModelOutput):
+ """
+ Base class for causal language model (or autoregressive) outputs.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss (for next-token prediction).
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class CausalLMOutputWithPast(ModelOutput):
+ """
+ Base class for causal language model (or autoregressive) outputs.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss (for next-token prediction).
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class CausalLMOutputWithCrossAttentions(ModelOutput):
+ """
+ Base class for causal language model (or autoregressive) outputs.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss (for next-token prediction).
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Cross attentions weights after the attention softmax, used to compute the weighted average in the
+ cross-attention heads.
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `paddle.Tensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
+ value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
+ setting. Only relevant if `config.is_decoder = True`.
+
+ Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class SequenceClassifierOutputWithPast(ModelOutput):
+ """
+ Base class for outputs of sentence classification models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+ `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class MaskedLMOutput(ModelOutput):
+ """
+ Base class for masked language models outputs.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Masked language modeling (MLM) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class Seq2SeqLMOutput(ModelOutput):
+ """
+ Base class for sequence-to-sequence language models outputs.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+ decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+ encoder_last_hidden_state: Optional[paddle.Tensor] = None
+ encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class NextSentencePredictorOutput(ModelOutput):
+ """
+ Base class for outputs of models predicting if two sentences are consecutive or not.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided):
+ Next sequence prediction (classification) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, 2)`):
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+ before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class SequenceClassifierOutput(ModelOutput):
+ """
+ Base class for outputs of sentence classification models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class Seq2SeqSequenceClassifierOutput(ModelOutput):
+ """
+ Base class for outputs of sequence-to-sequence sentence classification models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+ decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+ encoder_last_hidden_state: Optional[paddle.Tensor] = None
+ encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class MultipleChoiceModelOutput(ModelOutput):
+ """
+ Base class for outputs of multiple choice models.
+
+ Args:
+ loss (`paddle.Tensor` of shape *(1,)*, *optional*, returned when `labels` is provided):
+ Classification loss.
+ logits (`paddle.Tensor` of shape `(batch_size, num_choices)`):
+ *num_choices* is the second dimension of the input tensors. (see *input_ids* above).
+
+ Classification scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class TokenClassifierOutput(ModelOutput):
+ """
+ Base class for outputs of token classification models.
+
+ Args:
+        loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification loss.
+ logits (`paddle.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`):
+ Classification scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class QuestionAnsweringModelOutput(ModelOutput):
+ """
+ Base class for outputs of question answering models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+ start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Span-start scores (before SoftMax).
+ end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Span-end scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ start_logits: paddle.Tensor = None
+ end_logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class Seq2SeqQuestionAnsweringModelOutput(ModelOutput):
+ """
+ Base class for outputs of sequence-to-sequence question answering models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+ start_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Span-start scores (before SoftMax).
+ end_logits (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Span-end scores (before SoftMax).
+ past_key_values (`tuple(tuple(paddle.Tensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(paddle.Tensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the initial embedding outputs.
+ decoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ cross_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ start_logits: paddle.Tensor = None
+ end_logits: paddle.Tensor = None
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]] = None
+ decoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ decoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+ cross_attentions: Optional[Tuple[paddle.Tensor]] = None
+ encoder_last_hidden_state: Optional[paddle.Tensor] = None
+ encoder_hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ encoder_attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class SemanticSegmenterOutput(ModelOutput):
+ """
+ Base class for outputs of semantic segmentation models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
+ Classification scores for each pixel.
+
+            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
+            to avoid doing two interpolations and losing some quality when a user needs to resize the logits to the
+            original image size as post-processing. You should always check your logits shape and resize as needed.
+
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class ImageClassifierOutput(ModelOutput):
+ """
+ Base class for outputs of image classification models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
+ (also called feature maps) of the model at the output of each stage.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class ImageClassifierOutputWithNoAttention(ModelOutput):
+ """
+ Base class for outputs of image classification models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also
+ called feature maps) of the model at the output of each stage.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class DepthEstimatorOutput(ModelOutput):
+ """
+ Base class for outputs of depth estimation models.
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ predicted_depth (`paddle.Tensor` of shape `(batch_size, height, width)`):
+ Predicted depth for each pixel.
+
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, num_channels, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ predicted_depth: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class Wav2Vec2BaseModelOutput(ModelOutput):
+ """
+ Base class for models that have been trained with the Wav2Vec2 loss objective.
+
+ Args:
+ last_hidden_state (`paddle.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ extract_features (`paddle.Tensor` of shape `(batch_size, sequence_length, conv_dim[-1])`):
+ Sequence of extracted feature vectors of the last convolutional layer of the model.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: paddle.Tensor = None
+ extract_features: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+
+
+@dataclass
+class XVectorOutput(ModelOutput):
+ """
+ Output type of [`Wav2Vec2ForXVector`].
+
+ Args:
+ loss (`paddle.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification loss.
+ logits (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`):
+ Classification hidden states before AMSoftmax.
+ embeddings (`paddle.Tensor` of shape `(batch_size, config.xvector_output_dim)`):
+ Utterance embeddings used for vector similarity-based retrieval.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ embeddings: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
diff --git a/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py
new file mode 100755
index 000000000..5accff120
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/modeling_wav2vec2.py
@@ -0,0 +1,1259 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Paddle Wav2Vec2 model (ported from the HuggingFace PyTorch implementation)."""
+
+import functools
+import math
+import operator
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import numpy as np
+import paddle
+from paddle import nn
+
+from paddlespeech.s2t.models.wav2vec2.activations import ACT2FN
+from paddlespeech.s2t.models.wav2vec2.modeling_outputs import (
+ BaseModelOutput,
+ Wav2Vec2BaseModelOutput,
+ ModelOutput
+)
+
+
+from paddlespeech.s2t.utils.log import Log
+logger = Log(__name__).getlog()
+
+
+@dataclass
+class Wav2Vec2ForPreTrainingOutput(ModelOutput):
+ """
+ Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions.
+
+ Args:
+ loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`):
+            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the
+            [official paper](https://arxiv.org/pdf/2006.11477.pdf).
+ projected_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+ Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
+ projected quantized states.
+ projected_quantized_states (`paddle.Tensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+ Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
+ target vectors for contrastive loss.
+ hidden_states (`tuple(paddle.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `paddle.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(paddle.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `paddle.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`):
+ The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
+ diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `paddle.Tensor` of shape `(1,)`):
+ The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
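+
+    Note:
+        In the reference HuggingFace implementation the total `loss` is computed as
+        `contrastive_loss + config.diversity_loss_weight * diversity_loss`.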
+ """
+
+ loss: Optional[paddle.Tensor] = None
+ projected_states: paddle.Tensor = None
+ projected_quantized_states: paddle.Tensor = None
+ codevector_perplexity: paddle.Tensor = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ contrastive_loss: Optional[paddle.Tensor] = None
+ diversity_loss: Optional[paddle.Tensor] = None
+
+
+def _compute_mask_indices(
+ shape: Tuple[int, int],
+ mask_prob: float,
+ mask_length: int,
+ attention_mask: Optional[paddle.Tensor] = None,
+ min_masks: int = 0,
+) -> np.ndarray:
+ """
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+ CPU as part of the preprocessing during training.
+
+ Args:
+ shape: The shape for which to compute masks. This should be of a tuple of size 2 where
+ the first element is the batch size and the second element is the length of the axis to span.
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+ independently generated mask spans of length `mask_length` is computed by
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+ actual percentage will be smaller.
+ mask_length: size of the mask
+ min_masks: minimum number of masked spans
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+ each batch dimension.
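+
+    Example (a minimal sketch; the exact mask positions are random):
+
+    >>> mask = _compute_mask_indices(shape=(2, 100), mask_prob=0.05, mask_length=10)
+    >>> mask.shape
+    (2, 100)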
+ """
+ batch_size, sequence_length = shape
+
+ if mask_length < 1:
+ raise ValueError("`mask_length` has to be bigger than 0.")
+
+ if mask_length > sequence_length:
+ raise ValueError(
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+ f" and `sequence_length`: {sequence_length}`"
+ )
+
+ # epsilon is used for probabilistic rounding
+ epsilon = np.random.rand(1).item()
+
+ def compute_num_masked_span(input_length):
+ """Given input length, compute how many spans should be masked"""
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+ num_masked_span = max(num_masked_span, min_masks)
+
+ # make sure num masked span <= sequence_length
+ if num_masked_span * mask_length > sequence_length:
+ num_masked_span = sequence_length // mask_length
+
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
+ if input_length - (mask_length - 1) < num_masked_span:
+ num_masked_span = max(input_length - (mask_length - 1), 0)
+
+ return num_masked_span
+
+ # compute number of masked spans in batch
+ input_lengths = (
+ attention_mask.sum(-1).detach().tolist()
+ if attention_mask is not None
+ else [sequence_length for _ in range(batch_size)]
+ )
+
+ # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+ spec_aug_mask_idxs = []
+
+ max_num_masked_span = compute_num_masked_span(sequence_length)
+
+ if max_num_masked_span == 0:
+ return spec_aug_mask
+
+ for input_length in input_lengths:
+ # compute num of masked spans for this input
+ num_masked_span = compute_num_masked_span(input_length)
+
+ # get random indices to mask
+ spec_aug_mask_idx = np.random.choice(
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+ )
+
+ # pick first sampled index that will serve as a dummy index to pad vector
+ # to ensure same dimension for all batches due to probabilistic rounding
+ # Picking first sample just pads those vectors twice.
+ if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller than
+            # `sequence_length`, in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+ dummy_mask_idx = sequence_length - 1
+ else:
+ dummy_mask_idx = spec_aug_mask_idx[0]
+
+ spec_aug_mask_idx = np.concatenate(
+ [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+ )
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+ # expand masked indices to masked spans
+ spec_aug_mask_idxs = np.broadcast_to(
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+ )
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape((batch_size, max_num_masked_span * mask_length))
+
+ # add offset to the starting indexes so that indexes now create a span
+ offsets = np.arange(mask_length)[None, None, :]
+ offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+ (batch_size, max_num_masked_span * mask_length)
+ )
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+ # ensure that we cannot have indices larger than sequence_length
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+ # scatter indices to mask
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+ return spec_aug_mask
+
+
+def _sample_negative_indices(
+ features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
+):
+ """
+ Sample `num_negatives` vectors from feature vectors.
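+
+    For every masked time step, `num_negatives` other masked time steps of the same
+    utterance are sampled as negatives; the returned indices are offset by
+    `batch_idx * sequence_length` so that they can index the batch once it is flattened
+    over the time dimension.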
+ """
+ batch_size, sequence_length = features_shape
+
+ # generate indices of the positive vectors themselves, repeat them `num_negatives` times
+ sequence_length_range = np.arange(sequence_length)
+
+ # get `num_negatives` random vector indices from the same utterance
+ sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
+
+    mask_time_indices = (
+        mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
+    )
+
+ for batch_idx in range(batch_size):
+ high = mask_time_indices[batch_idx].sum() - 1
+ mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
+
+ feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
+ sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
+ # avoid sampling the same positive vector, but keep the distribution uniform
+ sampled_indices[sampled_indices >= feature_indices] += 1
+
+ # remap to actual indices
+ sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
+
+ # correct for batch size
+ sampled_negative_indices[batch_idx] += batch_idx * sequence_length
+
+ return sampled_negative_indices
+
+
+class Wav2Vec2NoLayerNormConvLayer(nn.Layer):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+ self.out_conv_dim = config.conv_dim[layer_id]
+
+ self.conv = nn.Conv1D(
+ self.in_conv_dim,
+ self.out_conv_dim,
+ kernel_size=config.conv_kernel[layer_id],
+ stride=config.conv_stride[layer_id],
+ bias_attr=config.conv_bias,
+ )
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Wav2Vec2LayerNormConvLayer(nn.Layer):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+ self.out_conv_dim = config.conv_dim[layer_id]
+
+ self.conv = nn.Conv1D(
+ self.in_conv_dim,
+ self.out_conv_dim,
+ kernel_size=config.conv_kernel[layer_id],
+ stride=config.conv_stride[layer_id],
+ bias_attr=config.conv_bias,
+ )
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim)
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = hidden_states.transpose([0, 2, 1])
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states.transpose([0, 2, 1])
+
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Wav2Vec2GroupNormConvLayer(nn.Layer):
+ def __init__(self, config, layer_id=0):
+ super().__init__()
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+ self.out_conv_dim = config.conv_dim[layer_id]
+
+ self.conv = nn.Conv1D(
+ self.in_conv_dim,
+ self.out_conv_dim,
+ kernel_size=config.conv_kernel[layer_id],
+ stride=config.conv_stride[layer_id],
+ bias_attr=config.conv_bias,
+ )
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim)
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ return hidden_states
+
+
+class Wav2Vec2PositionalConvEmbedding(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.conv = nn.Conv1D(
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=config.num_conv_pos_embeddings,
+ padding=config.num_conv_pos_embeddings // 2,
+ groups=config.num_conv_pos_embedding_groups,
+ )
+
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
+
+ self.padding = Wav2Vec2SamePadLayer(config.num_conv_pos_embeddings)
+ self.activation = ACT2FN[config.feat_extract_activation]
+
+ def forward(self, hidden_states):
+ hidden_states = hidden_states.transpose([0, 2, 1])
+
+ hidden_states = self.conv(hidden_states)
+ hidden_states = self.padding(hidden_states)
+ hidden_states = self.activation(hidden_states)
+
+ hidden_states = hidden_states.transpose([0, 2, 1])
+ return hidden_states
+
+
+class Wav2Vec2SamePadLayer(nn.Layer):
+ def __init__(self, num_conv_pos_embeddings):
+ super().__init__()
+ self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
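+        # with an even kernel size, the conv in Wav2Vec2PositionalConvEmbedding (padding = k // 2)
+        # produces one extra frame, which is trimmed here so the output length matches the input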
+
+ def forward(self, hidden_states):
+ if self.num_pad_remove > 0:
+ hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+ return hidden_states
+
+
+class Wav2Vec2FeatureEncoder(nn.Layer):
+ """Construct the features from raw audio waveform"""
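+    # With the default config used in this file (conv strides 5,2,2,2,2,2,2) the conv stack
+    # downsamples a 16 kHz waveform by a factor of 320, i.e. roughly one frame per 20 ms.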
+
+ def __init__(self, config):
+ super().__init__()
+
+ if config.feat_extract_norm == "group":
+ conv_layers = [Wav2Vec2GroupNormConvLayer(config, layer_id=0)] + [
+ Wav2Vec2NoLayerNormConvLayer(config, layer_id=i + 1) for i in range(config.num_feat_extract_layers - 1)
+ ]
+ elif config.feat_extract_norm == "layer":
+ conv_layers = [
+ Wav2Vec2LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
+ ]
+ else:
+ raise ValueError(
+ f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+ )
+ self.conv_layers = nn.LayerList(conv_layers)
+ self.gradient_checkpointing = False
+ self._requires_grad = True
+
+ def _freeze_parameters(self):
+ for param in self.parameters():
+ param.requires_grad = False
+ self._requires_grad = False
+
+ def forward(self, input_values):
+ hidden_states = input_values[:, None]
+
+ # make sure hidden_states require grad for gradient_checkpointing
+ #if self._requires_grad and self.training:
+ # hidden_states.requires_grad = True
+
+ for conv_layer in self.conv_layers:
+ hidden_states = conv_layer(hidden_states)
+
+ return hidden_states
+
+
+class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder):
+ def __init__(self, config):
+ super().__init__(config)
+ warnings.warn(
+            f"The class `{self.__class__.__name__}` has been deprecated "
+ "and will be removed in Transformers v5. "
+ f"Use `{self.__class__.__bases__[0].__name__}` instead.",
+ FutureWarning,
+ )
+
+
+class Wav2Vec2FeatureProjection(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.layer_norm = nn.LayerNorm(config.conv_dim[-1], epsilon=config.layer_norm_eps)
+ self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+ self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+ def forward(self, hidden_states):
+ # non-projected hidden states are needed for quantization
+ norm_hidden_states = self.layer_norm(hidden_states)
+ hidden_states = self.projection(norm_hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states, norm_hidden_states
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Wav2Vec2
+class Wav2Vec2Attention(nn.Layer):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
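+        # queries are pre-scaled by 1 / sqrt(head_dim), so the bmm of q and k in forward()
+        # already yields scaled dot-product attention scores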
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias_attr=bias)
+
+ def _shape(self, tensor: paddle.Tensor, seq_len: int, bsz: int):
+ return paddle.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)).transpose([0, 2, 1, 3])
+
+ def forward(
+ self,
+ hidden_states: paddle.Tensor,
+ key_value_states: Optional[paddle.Tensor] = None,
+ past_key_value: Optional[Tuple[paddle.Tensor]] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ layer_head_mask: Optional[paddle.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.shape
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ if is_cross_attention and past_key_value is not None:
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = paddle.concat([past_key_value[0], key_states], axis=2)
+ value_states = paddle.concat([past_key_value[1], value_states], axis=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(paddle.Tensor, paddle.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(paddle.Tensor, paddle.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).reshape(proj_shape)
+ key_states = key_states.reshape(proj_shape)
+ value_states = value_states.reshape(proj_shape)
+
+ src_len = key_states.shape[1]
+ attn_weights = paddle.bmm(query_states, key_states.transpose([0, 2, 1]))
+
+
+ if attn_weights.shape != [bsz * self.num_heads, tgt_len, src_len]:
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.shape}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.shape != [bsz, 1, tgt_len, src_len]:
+ raise ValueError(
+ f"Attention mask should be of size {[bsz, 1, tgt_len, src_len]}, but is {attention_mask.shape}"
+ )
+            attn_weights = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = nn.functional.softmax(attn_weights, axis=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.shape != [self.num_heads,]:
+ raise ValueError(
+ f"Head mask for a single layer should be of size {[self.num_heads,]}, but is"
+ f" {layer_head_mask.shape}"
+ )
+ attn_weights = layer_head_mask.reshape((1, -1, 1, 1)) * attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+ attn_weights = attn_weights.reshape((bsz * self.num_heads, tgt_len, src_len))
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.reshape((bsz, self.num_heads, tgt_len, src_len))
+ attn_weights = attn_weights_reshaped.reshape((bsz * self.num_heads, tgt_len, src_len))
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = paddle.bmm(attn_probs, value_states)
+
+ if attn_output.shape != [bsz * self.num_heads, tgt_len, self.head_dim]:
+ raise ValueError(
+                f"`attn_output` should be of size {[bsz * self.num_heads, tgt_len, self.head_dim]}, but is"
+ f" {attn_output.shape}"
+ )
+
+ attn_output = attn_output.reshape((bsz, self.num_heads, tgt_len, self.head_dim))
+ attn_output = attn_output.transpose([0, 2, 1, 3])
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape((bsz, tgt_len, self.embed_dim))
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_value
+
+
+class Wav2Vec2FeedForward(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+ self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+ def forward(self, hidden_states):
+ hidden_states = self.intermediate_dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ hidden_states = self.intermediate_dropout(hidden_states)
+
+ hidden_states = self.output_dense(hidden_states)
+ hidden_states = self.output_dropout(hidden_states)
+ return hidden_states
+
+
+class Wav2Vec2EncoderLayer(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = Wav2Vec2Attention(
+ embed_dim=config.hidden_size,
+ num_heads=config.num_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=False,
+ )
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.feed_forward = Wav2Vec2FeedForward(config)
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+
+ def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+ attn_residual = hidden_states
+ hidden_states, attn_weights, _ = self.attention(
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+ )
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = attn_residual + hidden_states
+
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = hidden_states + self.feed_forward(hidden_states)
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class Wav2Vec2EncoderLayerStableLayerNorm(nn.Layer):
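+    # pre-LayerNorm ("stable layer norm") variant of Wav2Vec2EncoderLayer: LayerNorm is
+    # applied before the attention and feed-forward blocks instead of after them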
+ def __init__(self, config):
+ super().__init__()
+ self.attention = Wav2Vec2Attention(
+ embed_dim=config.hidden_size,
+ num_heads=config.num_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=False,
+ )
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.feed_forward = Wav2Vec2FeedForward(config)
+ self.final_layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+
+ def forward(self, hidden_states, attention_mask=None, output_attentions=False):
+ attn_residual = hidden_states
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states, attn_weights, _ = self.attention(
+ hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+ )
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = attn_residual + hidden_states
+ hidden_states = hidden_states + self.feed_forward(self.final_layer_norm(hidden_states))
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class Wav2Vec2Encoder(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layers = nn.LayerList([Wav2Vec2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat_interleave(hidden_states.shape[2], axis=2)
+            hidden_states[~expand_attention_mask] = 0
+
+            # extend attention_mask to an additive mask with large negative values at padded positions
+            attention_mask = 1.0 - attention_mask[:, None, None, :].astype(hidden_states.dtype)
+            attention_mask = attention_mask * np.finfo(np.float32).min
+            attention_mask = attention_mask.expand(
+                [attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]]
+            )
+
+ position_embeddings = self.pos_conv_embed(hidden_states)
+ hidden_states = hidden_states + position_embeddings
+ hidden_states = self.layer_norm(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
+
+ for layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ dropout_probability = np.random.uniform(0, 1)
+
+            skip_the_layer = self.training and (dropout_probability < self.config.layerdrop)
+            if not skip_the_layer:
+                # NOTE: gradient checkpointing from the original PyTorch implementation is not
+                # ported here, so a regular forward pass is always used.
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]
+
+ if skip_the_layer:
+ layer_outputs = (None, None)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class Wav2Vec2EncoderStableLayerNorm(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.pos_conv_embed = Wav2Vec2PositionalConvEmbedding(config)
+ self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout)
+ self.layers = nn.LayerList(
+ [Wav2Vec2EncoderLayerStableLayerNorm(config) for _ in range(config.num_hidden_layers)]
+ )
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if attention_mask is not None:
+ # make sure padded tokens are not attended to
+ expand_attention_mask = attention_mask.unsqueeze(-1).repeat_interleave(hidden_states.shape[2], axis=2)
+ hidden_states[~expand_attention_mask] = 0
+
+            # extend attention_mask to an additive mask with large negative values at padded positions
+            attention_mask = 1.0 - attention_mask[:, None, None, :].astype(hidden_states.dtype)
+            attention_mask = attention_mask * np.finfo(np.float32).min
+            attention_mask = attention_mask.expand(
+                [attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]]
+            )
+
+ position_embeddings = self.pos_conv_embed(hidden_states)
+ hidden_states = hidden_states + position_embeddings
+ hidden_states = self.dropout(hidden_states)
+
+ #deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
+
+ for layer in self.layers:
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ dropout_probability = np.random.uniform(0, 1)
+
+            skip_the_layer = self.training and (dropout_probability < self.config.layerdrop)
+            if not skip_the_layer:
+                # NOTE: gradient checkpointing from the original PyTorch implementation is not
+                # ported here, so a regular forward pass is always used.
+                layer_outputs = layer(
+                    hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
+                )
+                hidden_states = layer_outputs[0]
+
+ if skip_the_layer:
+ layer_outputs = (None, None)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ hidden_states = self.layer_norm(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class Wav2Vec2GumbelVectorQuantizer(nn.Layer):
+ """
+ Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
+ GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf) for more information.
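+
+    Each frame is split into `num_codevector_groups` groups; every group selects one of
+    `num_codevectors_per_group` codewords, and the selected codewords are concatenated to
+    form the quantized representation.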
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.num_groups = config.num_codevector_groups
+ self.num_vars = config.num_codevectors_per_group
+
+ if config.codevector_dim % self.num_groups != 0:
+ raise ValueError(
+ f"`config.codevector_dim {config.codevector_dim} must be divisible "
+ f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+ )
+
+ # storage for codebook variables (codewords)
+ self.codevectors = paddle.static.create_parameter(
+ shape=[1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups], dtype='float32'
+ )
+ self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+ # can be decayed for training
+ self.temperature = 2
+
+ @staticmethod
+ def _compute_perplexity(probs, mask=None):
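+        # perplexity of the (masked) average code distribution; higher values indicate a more
+        # uniform use of the codebook and are encouraged by the diversity loss during pretraining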
+ if mask is not None:
+            mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
+            probs = paddle.where(mask_extended, probs, paddle.zeros_like(probs))
+            marginal_probs = probs.sum(axis=0) / mask.sum()
+        else:
+            marginal_probs = probs.mean(axis=0)
+
+        perplexity = paddle.exp(-paddle.sum(marginal_probs * paddle.log(marginal_probs + 1e-7), axis=-1)).sum()
+ return perplexity
+
+ def forward(self, hidden_states, mask_time_indices=None):
+ batch_size, sequence_length, hidden_size = hidden_states.shape
+
+ # project to codevector dim
+ hidden_states = self.weight_proj(hidden_states)
+ hidden_states = hidden_states.reshape((batch_size * sequence_length * self.num_groups, -1))
+
+        if self.training:
+            # sample codevector probs via gumbel-softmax in a differentiable way
+            codevector_probs = nn.functional.gumbel_softmax(
+                hidden_states.astype('float32'), temperature=self.temperature, hard=True
+            ).astype(hidden_states.dtype)
+
+            # compute perplexity
+            codevector_soft_dist = nn.functional.softmax(
+                hidden_states.reshape((batch_size * sequence_length, self.num_groups, -1)).astype('float32'), axis=-1
+            )
+            perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
+        else:
+            # take argmax in non-differentiable way
+            # compute hard codevector distribution (one hot)
+            codevector_idx = hidden_states.argmax(axis=-1)
+            codevector_probs = nn.functional.one_hot(
+                codevector_idx, hidden_states.shape[-1]
+            ).astype(hidden_states.dtype)
+            codevector_probs = codevector_probs.reshape((batch_size * sequence_length, self.num_groups, -1))
+
+ perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
+
+ codevector_probs = codevector_probs.reshape((batch_size * sequence_length, -1))
+ # use probs to retrieve codevectors
+ codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+ codevectors = codevectors_per_group.reshape((batch_size * sequence_length, self.num_groups, self.num_vars, -1))
+ codevectors = codevectors.sum(-2).reshape((batch_size, sequence_length, -1))
+
+ return codevectors, perplexity
+
+
+class Wav2Vec2Adapter(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+
+ # feature dim might need to be down-projected
+ if config.output_hidden_size != config.hidden_size:
+ self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+ self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
+ else:
+ self.proj = self.proj_layer_norm = None
+
+ self.layers = nn.LayerList(Wav2Vec2AdapterLayer(config) for _ in range(config.num_adapter_layers))
+ self.layerdrop = config.layerdrop
+
+ def forward(self, hidden_states):
+ # down project hidden_states if necessary
+ if self.proj is not None and self.proj_layer_norm is not None:
+ hidden_states = self.proj(hidden_states)
+ hidden_states = self.proj_layer_norm(hidden_states)
+
+ hidden_states = hidden_states.transpose([0, 2, 1])
+
+ for layer in self.layers:
+ layerdrop_prob = np.random.random()
+ if not self.training or (layerdrop_prob > self.layerdrop):
+ hidden_states = layer(hidden_states)
+
+ hidden_states = hidden_states.transpose([0, 2, 1])
+ return hidden_states
+
+
+class Wav2Vec2AdapterLayer(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.conv = nn.Conv1D(
+ config.output_hidden_size,
+ 2 * config.output_hidden_size,
+ config.adapter_kernel_size,
+ stride=config.adapter_stride,
+ padding=1,
+ )
+
+ def forward(self, hidden_states):
+ hidden_states = self.conv(hidden_states)
+ hidden_states = nn.functional.glu(hidden_states, axis=1)
+
+ return hidden_states
+
+
+class Wav2Vec2Model(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.feature_extractor = Wav2Vec2FeatureEncoder(config)
+ self.feature_projection = Wav2Vec2FeatureProjection(config)
+
+ # model only needs masking vector if mask prob is > 0.0
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+ # self.masked_spec_embed = nn.Parameter(paddle.Tensor(config.hidden_size).uniform_())
+ #self.masked_spec_embed = paddle.uniform([config.hidden_size])
+ self.masked_spec_embed = paddle.static.create_parameter(shape=[config.hidden_size], dtype='float32', default_initializer=paddle.nn.initializer.Uniform(low=0, high=1.0))
+ if config.do_stable_layer_norm:
+ self.encoder = Wav2Vec2EncoderStableLayerNorm(config)
+ else:
+ self.encoder = Wav2Vec2Encoder(config)
+
+ self.adapter = Wav2Vec2Adapter(config) if config.add_adapter else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def freeze_feature_extractor(self):
+ """
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ warnings.warn(
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
+ FutureWarning,
+ )
+ self.freeze_feature_encoder()
+
+ def freeze_feature_encoder(self):
+ """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
+ not be updated during training.
+ """
+ self.feature_extractor._freeze_parameters()
+
+ def _mask_hidden_states(
+ self,
+ hidden_states: paddle.Tensor,
+ mask_time_indices: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ ):
+ """
+ Masks extracted features along time axis and/or along feature axis according to
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
+ """
+
+ # `config.apply_spec_augment` can set masking to False
+ if not getattr(self.config, "apply_spec_augment", True):
+ return hidden_states
+
+ # generate indices & apply SpecAugment along time axis
+ batch_size, sequence_length, hidden_size = hidden_states.shape
+
+ if mask_time_indices is not None:
+ # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype)
+ elif self.config.mask_time_prob > 0 and self.training:
+ mask_time_indices = _compute_mask_indices(
+ (batch_size, sequence_length),
+ mask_prob=self.config.mask_time_prob,
+ mask_length=self.config.mask_time_length,
+ attention_mask=attention_mask,
+ min_masks=self.config.mask_time_min_masks,
+ )
+ mask_time_indices = paddle.to_tensor(mask_time_indices, dtype=paddle.bool)
+            hidden_states[mask_time_indices] = self.masked_spec_embed.astype(hidden_states.dtype)
+
+ if self.config.mask_feature_prob > 0 and self.training:
+ # generate indices & apply SpecAugment along feature axis
+ mask_feature_indices = _compute_mask_indices(
+ (batch_size, hidden_size),
+ mask_prob=self.config.mask_feature_prob,
+ mask_length=self.config.mask_feature_length,
+ min_masks=self.config.mask_feature_min_masks,
+ )
+ mask_feature_indices = paddle.to_tensor(mask_feature_indices, dtype=paddle.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand([-1, sequence_length, -1])
+ hidden_states[mask_feature_indices] = 0
+
+ return hidden_states
+
+ def forward(
+ self,
+ input_values: Optional[paddle.Tensor],
+ attention_mask: Optional[paddle.Tensor] = None,
+ mask_time_indices: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+ extract_features = extract_features.transpose([0, 2, 1])
+
+ if attention_mask is not None:
+ # compute reduced attention_mask corresponding to feature vectors
+ attention_mask = self._get_feature_vector_attention_mask(
+ extract_features.shape[1], attention_mask, add_adapter=False
+ )
+        hidden_states, extract_features = self.feature_projection(extract_features)
+ hidden_states = self._mask_hidden_states(
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+ )
+
+ encoder_outputs = self.encoder(
+ hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = encoder_outputs[0]
+
+ if self.adapter is not None:
+ hidden_states = self.adapter(hidden_states)
+
+ if not return_dict:
+ return (hidden_states, extract_features) + encoder_outputs[1:]
+
+ return Wav2Vec2BaseModelOutput(
+ last_hidden_state=hidden_states,
+ extract_features=extract_features,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def post_init(self):
+ """
+ A method executed at the end of each Transformer model initialization, to execute code that needs the model's
+ modules properly initialized (such as weight initialization).
+ """
+ # self.init_weights()
+ # self._backward_compatibility_gradient_checkpointing()
+ pass
+
+class Wav2Vec2ConfigPure():
+ model_type = "wav2vec2"
+ def __init__(
+ self,
+ vocab_size=32,
+ hidden_size=1024,
+ num_hidden_layers=24,
+ num_attention_heads=16,
+ intermediate_size=4096,
+ hidden_act="gelu",
+ hidden_dropout=0.1,
+ activation_dropout=0.1,
+ attention_dropout=0.1,
+ feat_proj_dropout=0.1,
+ feat_quantizer_dropout=0.0,
+ final_dropout=0.1,
+ layerdrop=0.1,
+ initializer_range=0.02,
+ layer_norm_eps=1e-5,
+ feat_extract_norm="layer",
+ feat_extract_activation="gelu",
+ conv_dim=(512, 512, 512, 512, 512, 512, 512),
+ conv_stride=(5, 2, 2, 2, 2, 2, 2),
+ conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+ conv_bias=True,
+ num_conv_pos_embeddings=128,
+ num_conv_pos_embedding_groups=16,
+ do_stable_layer_norm=True,
+ apply_spec_augment=True,
+ mask_time_prob=0.05,
+ mask_time_length=10,
+ mask_time_min_masks=2,
+ mask_feature_prob=0.0,
+ mask_feature_length=10,
+ mask_feature_min_masks=0,
+ num_codevectors_per_group=320,
+ num_codevector_groups=2,
+ contrastive_logits_temperature=0.1,
+ num_negatives=100,
+ codevector_dim=256,
+ proj_codevector_dim=256,
+ diversity_loss_weight=0.1,
+ ctc_loss_reduction="sum",
+ ctc_zero_infinity=False,
+ use_weighted_layer_sum=False,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ add_adapter=False,
+ adapter_kernel_size=3,
+ adapter_stride=2,
+ num_adapter_layers=3,
+ output_hidden_size=None,
+ **kwargs
+ ):
+ self.output_attentions = False
+ self.output_hidden_states = False
+ self.use_return_dict = True
+
+ self.pad_token_id = pad_token_id
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.hidden_size = hidden_size
+ self.feat_extract_norm = feat_extract_norm
+ self.feat_extract_activation = feat_extract_activation
+ self.conv_dim = list(conv_dim)
+ self.conv_stride = list(conv_stride)
+ self.conv_kernel = list(conv_kernel)
+ self.conv_bias = conv_bias
+ self.num_conv_pos_embeddings = num_conv_pos_embeddings
+ self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+ self.num_feat_extract_layers = len(self.conv_dim)
+ self.num_hidden_layers = num_hidden_layers
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.num_attention_heads = num_attention_heads
+ self.hidden_dropout = hidden_dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.feat_proj_dropout = feat_proj_dropout
+ self.final_dropout = final_dropout
+ self.layerdrop = layerdrop
+ self.layer_norm_eps = layer_norm_eps
+ self.initializer_range = initializer_range
+ self.vocab_size = vocab_size
+ self.do_stable_layer_norm = do_stable_layer_norm
+ self.use_weighted_layer_sum = use_weighted_layer_sum
+
+ if (
+ (len(self.conv_stride) != self.num_feat_extract_layers)
+ or (len(self.conv_kernel) != self.num_feat_extract_layers)
+ or (len(self.conv_dim) != self.num_feat_extract_layers)
+ ):
+ raise ValueError(
+ "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+ " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+ f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+ f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+ )
+
+ # fine-tuning config parameters for SpecAugment: https://arxiv.org/abs/1904.08779
+ self.apply_spec_augment = apply_spec_augment
+ self.mask_time_prob = mask_time_prob
+ self.mask_time_length = mask_time_length
+ self.mask_time_min_masks = mask_time_min_masks
+ self.mask_feature_prob = mask_feature_prob
+ self.mask_feature_length = mask_feature_length
+ self.mask_feature_min_masks = mask_feature_min_masks
+
+ # parameters for pretraining with codevector quantized representations
+ self.num_codevectors_per_group = num_codevectors_per_group
+ self.num_codevector_groups = num_codevector_groups
+ self.contrastive_logits_temperature = contrastive_logits_temperature
+ self.feat_quantizer_dropout = feat_quantizer_dropout
+ self.num_negatives = num_negatives
+ self.codevector_dim = codevector_dim
+ self.proj_codevector_dim = proj_codevector_dim
+ self.diversity_loss_weight = diversity_loss_weight
+
+ # ctc loss
+ self.ctc_loss_reduction = ctc_loss_reduction
+ self.ctc_zero_infinity = ctc_zero_infinity
+
+ # adapter
+ self.add_adapter = add_adapter
+ self.adapter_kernel_size = adapter_kernel_size
+ self.adapter_stride = adapter_stride
+ self.num_adapter_layers = num_adapter_layers
+ self.output_hidden_size = output_hidden_size or hidden_size
+
+ @property
+ def inputs_to_logits_ratio(self):
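+        # overall time-downsampling factor of the feature encoder: the product of all conv strides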
+ return functools.reduce(operator.mul, self.conv_stride, 1)
+
+
+def main():
+ config = Wav2Vec2ConfigPure()
+ model = Wav2Vec2Model(config)
+ model_dict = model.state_dict()
+# checkpoint_path = "wav2vec2_test"
+# params_path = checkpoint_path + ".pdparams"
+# paddle.save(model_dict, params_path)
+ revise_params_path = "exp/wav2vec2-large-960h-lv60-self.pdparams"
+ model_dict_revise = paddle.load(revise_params_path)
+ model.set_state_dict(model_dict_revise)
+    model.eval()
+ input_values = np.load("input_values.npy")
+ input_values = paddle.to_tensor(input_values)
+ outputs = model(input_values)
+ last_hidden_state = outputs.last_hidden_state
+ extract_features = outputs.extract_features
+ hidden_states = outputs.hidden_states
+ attentions = outputs.attentions
+    print("last_hidden_state")
+    print(last_hidden_state)
+    np.save("paddle_last_hidden_state.npy", last_hidden_state.numpy())
+    print("extract_features")
+    print(extract_features)
+    np.save("paddle_extract_features.npy", extract_features.numpy())
+    print("hidden_states")
+    print(hidden_states)
+    print("attentions")
+    print(attentions)
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py
new file mode 100644
index 000000000..057be1d46
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/augment.py
@@ -0,0 +1,359 @@
+import os
+import paddle
+from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.signal_processing import (
+    compute_amplitude,
+    convolve1d,
+    notch_filter,
+)
+from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.speech_augmentation import (
+    SpeedPerturb,
+    DropFreq,
+    DropChunk,
+)
+
+
+class TimeDomainSpecAugment(paddle.nn.Layer):
+ """A time-domain approximation of the SpecAugment algorithm.
+ This augmentation module implements three augmentations in
+ the time-domain.
+ 1. Drop chunks of the audio (zero amplitude or white noise)
+ 2. Drop frequency bands (with band-drop filters)
+    3. Speed perturbation (via resampling to slightly different rate)
+ Arguments
+ ---------
+ perturb_prob : float from 0 to 1
+ The probability that a batch will have speed perturbation applied.
+ drop_freq_prob : float from 0 to 1
+ The probability that a batch will have frequencies dropped.
+ drop_chunk_prob : float from 0 to 1
+ The probability that a batch will have chunks dropped.
+ speeds : list of ints
+ A set of different speeds to use to perturb each batch.
+ See ``speechbrain.processing.speech_augmentation.SpeedPerturb``
+ sample_rate : int
+ Sampling rate of the input waveforms.
+ drop_freq_count_low : int
+ Lowest number of frequencies that could be dropped.
+ drop_freq_count_high : int
+ Highest number of frequencies that could be dropped.
+ drop_chunk_count_low : int
+ Lowest number of chunks that could be dropped.
+ drop_chunk_count_high : int
+ Highest number of chunks that could be dropped.
+ drop_chunk_length_low : int
+ Lowest length of chunks that could be dropped.
+ drop_chunk_length_high : int
+ Highest length of chunks that could be dropped.
+ drop_chunk_noise_factor : float
+ The noise factor used to scale the white noise inserted, relative to
+ the average amplitude of the utterance. Default 0 (no noise inserted).
+ Example
+ -------
+    >>> inputs = paddle.randn([10, 16000])
+    >>> feature_maker = TimeDomainSpecAugment(speeds=[80])
+    >>> feats = feature_maker(inputs, paddle.ones([10]))
+    >>> feats.shape
+    [10, 12800]
+ """
+
+ def __init__(
+ self,
+ perturb_prob=1.0,
+ drop_freq_prob=1.0,
+ drop_chunk_prob=1.0,
+ speeds=[95, 100, 105],
+ sample_rate=16000,
+ drop_freq_count_low=0,
+ drop_freq_count_high=3,
+ drop_chunk_count_low=0,
+ drop_chunk_count_high=5,
+ drop_chunk_length_low=1000,
+ drop_chunk_length_high=2000,
+ drop_chunk_noise_factor=0,
+ ):
+ super().__init__()
+ self.speed_perturb = SpeedPerturb(
+ perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds
+ )
+ self.drop_freq = DropFreq(
+ drop_prob=drop_freq_prob,
+ drop_count_low=drop_freq_count_low,
+ drop_count_high=drop_freq_count_high,
+ )
+ self.drop_chunk = DropChunk(
+ drop_prob=drop_chunk_prob,
+ drop_count_low=drop_chunk_count_low,
+ drop_count_high=drop_chunk_count_high,
+ drop_length_low=drop_chunk_length_low,
+ drop_length_high=drop_chunk_length_high,
+ noise_factor=drop_chunk_noise_factor,
+ )
+
+ def forward(self, waveforms, lengths):
+ """Returns the distorted waveforms.
+ Arguments
+ ---------
+        waveforms : paddle.Tensor
+ The waveforms to distort
+ """
+ # Augmentation
+ with paddle.no_grad():
+ waveforms = self.speed_perturb(waveforms)
+ waveforms = self.drop_freq(waveforms)
+ waveforms = self.drop_chunk(waveforms, lengths)
+
+        return waveforms
+
+
+class DropFreq(paddle.nn.Layer):
+ """This class drops a random frequency from the signal.
+ The purpose of this class is to teach models to learn to rely on all parts
+ of the signal, not just a few frequency bands.
+ Arguments
+ ---------
+ drop_freq_low : float
+ The low end of frequencies that can be dropped,
+ as a fraction of the sampling rate / 2.
+ drop_freq_high : float
+ The high end of frequencies that can be
+ dropped, as a fraction of the sampling rate / 2.
+ drop_count_low : int
+ The low end of number of frequencies that could be dropped.
+ drop_count_high : int
+ The high end of number of frequencies that could be dropped.
+ drop_width : float
+ The width of the frequency band to drop, as
+ a fraction of the sampling_rate / 2.
+ drop_prob : float
+ The probability that the batch of signals will have a frequency
+ dropped. By default, every batch has frequencies dropped.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> dropper = DropFreq()
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> dropped_signal = dropper(signal.unsqueeze(0))
+ """
+
+ def __init__(
+ self,
+ drop_freq_low=1e-14,
+ drop_freq_high=1,
+ drop_count_low=1,
+ drop_count_high=2,
+ drop_width=0.05,
+ drop_prob=1,
+ ):
+ super().__init__()
+ self.drop_freq_low = drop_freq_low
+ self.drop_freq_high = drop_freq_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_width = drop_width
+ self.drop_prob = drop_prob
+
+ def forward(self, waveforms):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or `[batch, time, channels]`.
+ """
+
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+ dropped_waveform = waveforms.clone()
+        if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Add channels dimension
+ if len(waveforms.shape) == 2:
+ dropped_waveform = dropped_waveform.unsqueeze(-1)
+
+ # Pick number of frequencies to drop
+        drop_count = paddle.randint(
+            low=self.drop_count_low, high=self.drop_count_high + 1, shape=(1,),
+        )
+
+ # Pick a frequency to drop
+ drop_range = self.drop_freq_high - self.drop_freq_low
+ drop_frequency = (
+            paddle.rand(drop_count) * drop_range + self.drop_freq_low
+ )
+
+ # Filter parameters
+ filter_length = 101
+ pad = filter_length // 2
+
+ # Start with delta function
+        drop_filter = paddle.zeros([1, filter_length, 1])
+ drop_filter[0, pad, 0] = 1
+
+ # Subtract each frequency
+ for frequency in drop_frequency:
+            notch_kernel = notch_filter(
+                frequency, filter_length, self.drop_width,
+            )
+ drop_filter = convolve1d(drop_filter, notch_kernel, pad)
+
+ # Apply filter
+ dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
+
+ # Remove channels dimension if added
+ return dropped_waveform.squeeze(-1)
+
+class DropChunk(paddle.nn.Layer):
+ """This class drops portions of the input signal.
+    Using `DropChunk` as an augmentation strategy helps models learn to rely
+ on all parts of the signal, since it can't expect a given part to be
+ present.
+ Arguments
+ ---------
+ drop_length_low : int
+ The low end of lengths for which to set the
+ signal to zero, in samples.
+ drop_length_high : int
+ The high end of lengths for which to set the
+ signal to zero, in samples.
+ drop_count_low : int
+ The low end of number of times that the signal
+ can be dropped to zero.
+ drop_count_high : int
+ The high end of number of times that the signal
+ can be dropped to zero.
+ drop_start : int
+ The first index for which dropping will be allowed.
+ drop_end : int
+ The last index for which dropping will be allowed.
+ drop_prob : float
+ The probability that the batch of signals will
+ have a portion dropped. By default, every batch
+ has portions dropped.
+ noise_factor : float
+ The factor relative to average amplitude of an utterance
+ to use for scaling the white noise inserted. 1 keeps
+ the average amplitude the same, while 0 inserts all 0's.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> signal = signal.unsqueeze(0) # [batch, time, channels]
+ >>> length = torch.ones(1)
+ >>> dropped_signal = dropper(signal, length)
+ >>> float(dropped_signal[:, 150])
+ 0.0
+ """
+
+ def __init__(
+ self,
+ drop_length_low=100,
+ drop_length_high=1000,
+ drop_count_low=1,
+ drop_count_high=10,
+ drop_start=0,
+ drop_end=None,
+ drop_prob=1,
+ noise_factor=0.0,
+ ):
+ super().__init__()
+ self.drop_length_low = drop_length_low
+ self.drop_length_high = drop_length_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_start = drop_start
+ self.drop_end = drop_end
+ self.drop_prob = drop_prob
+ self.noise_factor = noise_factor
+
+ # Validate low < high
+ if drop_length_low > drop_length_high:
+ raise ValueError("Low limit must not be more than high limit")
+ if drop_count_low > drop_count_high:
+ raise ValueError("Low limit must not be more than high limit")
+
+ # Make sure the length doesn't exceed end - start
+ if drop_end is not None and drop_end >= 0:
+ if drop_start > drop_end:
+ raise ValueError("Low limit must not be more than high limit")
+
+ drop_range = drop_end - drop_start
+ self.drop_length_low = min(drop_length_low, drop_range)
+ self.drop_length_high = min(drop_length_high, drop_range)
+
+ def forward(self, waveforms, lengths):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ lengths : tensor
+ Shape should be a single dimension, `[batch]`.
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or
+ `[batch, time, channels]`
+ """
+
+ # Reading input list
+        lengths = (lengths * waveforms.shape[1]).astype(paddle.int64)
+        batch_size = waveforms.shape[0]
+ dropped_waveform = waveforms.clone()
+
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+        if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Store original amplitude for computing white noise amplitude
+ clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
+
+ # Pick a number of times to drop
+        drop_times = paddle.randint(
+            low=self.drop_count_low,
+            high=self.drop_count_high + 1,
+            shape=(batch_size,),
+        )
+
+ # Iterate batch to set mask
+ for i in range(batch_size):
+ if drop_times[i] == 0:
+ continue
+
+ # Pick lengths
+            length = paddle.randint(
+                low=self.drop_length_low,
+                high=self.drop_length_high + 1,
+                shape=(drop_times[i],),
+            )
+
+ # Compute range of starting locations
+ start_min = self.drop_start
+ if start_min < 0:
+ start_min += lengths[i]
+ start_max = self.drop_end
+ if start_max is None:
+ start_max = lengths[i]
+ if start_max < 0:
+ start_max += lengths[i]
+ start_max = max(0, start_max - length.max())
+
+ # Pick starting locations
+            start = paddle.randint(
+                low=start_min, high=start_max + 1, shape=(drop_times[i],),
+            )
+
+ end = start + length
+
+ # Update waveform
+ if not self.noise_factor:
+ for j in range(drop_times[i]):
+ dropped_waveform[i, start[j] : end[j]] = 0.0
+ else:
+ # Uniform distribution of -2 to +2 * avg amplitude should
+ # preserve the average for normalization
+ noise_max = 2 * clean_amplitude[i] * self.noise_factor
+ for j in range(drop_times[i]):
+ # zero-center the noise distribution
+                    noise_vec = paddle.rand(length[j])
+ noise_vec = 2 * noise_max * noise_vec - noise_max
+ dropped_waveform[i, start[j] : end[j]] = noise_vec
+
+        return dropped_waveform
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py
new file mode 100644
index 000000000..8eb56e759
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/VanillaNN.py
@@ -0,0 +1,45 @@
+"""Vanilla Neural Network for simple tests.
+Authors
+* Elena Rastorgueva 2020
+"""
+import paddle
+from paddlespeech.s2t.models.wav2vec2.speechbrain.nnet import containers
+import paddlespeech.s2t.models.wav2vec2.speechbrain as sb
+
+
+class VanillaNN(containers.Sequential):
+ """A simple vanilla Deep Neural Network.
+ Arguments
+ ---------
+ activation : paddle class
+ A class used for constructing the activation layers.
+ dnn_blocks : int
+ The number of linear neural blocks to include.
+ dnn_neurons : int
+ The number of neurons in the linear layers.
+ Example
+ -------
+ >>> inputs = paddle.rand([10, 120, 60])
+ >>> model = VanillaNN(input_shape=inputs.shape)
+ >>> outputs = model(inputs)
+ >>> outputs.shape
+    [10, 120, 512]
+ """
+
+ def __init__(
+ self,
+ input_shape,
+ activation=paddle.nn.LeakyReLU,
+ dnn_blocks=2,
+ dnn_neurons=512,
+ ):
+ super().__init__(input_shape=input_shape)
+
+ for block_index in range(dnn_blocks):
+ self.append(
+ sb.nnet.linear.Linear,
+ n_neurons=dnn_neurons,
+ bias=True,
+ layer_name="linear",
+ )
+ self.append(activation(), layer_name="act")
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/lobes/models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py
new file mode 100644
index 000000000..f8f087714
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/__init__.py
@@ -0,0 +1,2 @@
+from . import linear
+from . import containers
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py
new file mode 100644
index 000000000..078806902
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/containers.py
@@ -0,0 +1,132 @@
+import paddle
+import inspect
+import logging
+import operator
+import functools
+
+class Sequential(paddle.nn.LayerDict):
+ """A sequence of modules with potentially inferring shape on construction.
+ If layers are passed with names, these can be referenced with dot notation.
+ Arguments
+ ---------
+ input_shape : iterable
+ A list or tuple of ints or None, representing the expected shape of an
+ input tensor. None represents a variable-length dimension. If no
+ ``input_shape`` is passed, no shape inference will be performed.
+ *layers, **named_layers
+ The inputs are treated as a list of layers to be
+ applied in sequence. The output shape of each layer is used to
+ infer the shape of the following layer. If a tuple is returned,
+ only the shape of the first element is used to determine input
+ shape of the next layer (e.g. RNN returns output, hidden).
+ Example
+ -------
+    >>> inputs = paddle.rand([10, 40, 50])
+    >>> model = Sequential(input_shape=inputs.shape)
+    >>> model.append(Linear, n_neurons=100, layer_name="layer1")
+    >>> model.append(Linear, n_neurons=200, layer_name="layer2")
+    >>> outputs = model(inputs)
+    >>> outputs.shape
+    [10, 40, 200]
+    >>> outputs = model.layer1(inputs)
+    >>> outputs.shape
+    [10, 40, 100]
+ """
+
+ def __init__(self, *layers, input_shape=None, **named_layers):
+ super().__init__()
+
+ # Make sure either layers or input_shape is passed
+ if not layers and input_shape is None and not named_layers:
+ raise ValueError("Must pass either layers or input shape")
+
+ # Keep track of what layers need "lengths" passed
+ self.length_layers = []
+
+ # Replace None dimensions with arbitrary value
+ self.input_shape = input_shape
+ if input_shape and None in input_shape:
+ self.input_shape = list(input_shape)
+ for i, dim in enumerate(self.input_shape):
+
+ # To reduce size of dummy tensors, use 1 for batch dim
+ if i == 0 and dim is None:
+ dim = 1
+
+                # Use 256 as a nice round arbitrary value, big enough that
+                # halving this dimension a few times doesn't reach 1
+                self.input_shape[i] = dim or 256
+
+ # Append non-named layers
+ for layer in layers:
+ self.append(layer)
+
+ # Append named layers
+ for name, layer in named_layers.items():
+ self.append(layer, layer_name=name)
+
+ def append(self, layer, *args, layer_name=None, **kwargs):
+ """Add a layer to the list of layers, inferring shape if necessary.
+ Arguments
+ ---------
+        layer : A paddle.nn.Layer class or object
+ If the layer is a class, it should accept an argument called
+ ``input_shape`` which will be inferred and passed. If the layer
+ is a module object, it is added as-is.
+ layer_name : str
+ The name of the layer, for reference. If the name is in use,
+ ``_{count}`` will be appended.
+ *args, **kwargs
+ These are passed to the layer if it is constructed.
+ """
+
+ # Compute layer_name
+ if layer_name is None:
+ layer_name = str(len(self))
+ elif layer_name in self:
+ index = 0
+ while f"{layer_name}_{index}" in self:
+ index += 1
+ layer_name = f"{layer_name}_{index}"
+
+ # Check if it needs to be constructed with input shape
+ if self.input_shape:
+ argspec = inspect.getfullargspec(layer)
+ if "input_shape" in argspec.args + argspec.kwonlyargs:
+ input_shape = self.get_output_shape()
+ layer = layer(*args, input_shape=input_shape, **kwargs)
+
+ # Finally, append the layer.
+ try:
+ self[layer_name] = layer
+ # self.add_module(layer_name, layer)
+ except TypeError:
+ raise ValueError(
+ "Must pass `input_shape` at initialization and use "
+ "modules that take `input_shape` to infer shape when "
+ "using `append()`."
+ )
+
+ def get_output_shape(self):
+ """Returns expected shape of the output.
+ Computed by passing dummy input constructed with the
+ ``self.input_shape`` attribute.
+ """
+ with paddle.no_grad():
+ dummy_input = paddle.zeros(self.input_shape)
+ dummy_output = self(dummy_input)
+ return dummy_output.shape
+
+ def forward(self, x):
+ """Applies layers in sequence, passing only the first element of tuples.
+ Arguments
+ ---------
+ x : paddle.Tensor
+ The input tensor to run through the network.
+ """
+ for layer in self.values():
+ x = layer(x)
+ if isinstance(x, tuple):
+ x = x[0]
+
+ return x
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py
new file mode 100644
index 000000000..26389d908
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/nnet/linear.py
@@ -0,0 +1,73 @@
+"""Library implementing linear transformation.
+Authors
+ * Mirco Ravanelli 2020
+ * Davide Borra 2021
+"""
+
+import logging
+import paddle
+import paddle.nn as nn
+from paddlespeech.s2t.modules import align
+
+logger = logging.getLogger(__name__)
+
+
+class Linear(paddle.nn.Layer):
+ """Computes a linear transformation y = wx + b.
+ Arguments
+ ---------
+ n_neurons : int
+ It is the number of output neurons (i.e, the dimensionality of the
+ output).
+ input_shape: tuple
+ It is the shape of the input tensor.
+ input_size: int
+ Size of the input tensor.
+ bias : bool
+ If True, the additive bias b is adopted.
+ combine_dims : bool
+ If True and the input is 4D, combine 3rd and 4th dimensions of input.
+ Example
+ -------
+    >>> inputs = paddle.rand([10, 50, 40])
+    >>> lin_t = Linear(input_shape=(10, 50, 40), n_neurons=100)
+    >>> output = lin_t(inputs)
+    >>> output.shape
+    [10, 50, 100]
+ """
+
+ def __init__(
+ self,
+ n_neurons,
+ input_shape=None,
+ input_size=None,
+ bias=True,
+ combine_dims=False,
+ ):
+ super().__init__()
+ self.combine_dims = combine_dims
+
+ if input_shape is None and input_size is None:
+ raise ValueError("Expected one of input_shape or input_size")
+
+ if input_size is None:
+ input_size = input_shape[-1]
+ if len(input_shape) == 4 and self.combine_dims:
+ input_size = input_shape[2] * input_shape[3]
+
+ # Weights are initialized following paddle approach
+ self.w = align.Linear(input_size, n_neurons, bias_attr=bias)
+
+ def forward(self, x):
+ """Returns the linear transformation of input tensor.
+ Arguments
+ ---------
+ x : paddle.Tensor
+ Input to transform linearly.
+ """
+        if len(x.shape) == 4 and self.combine_dims:
+            x = x.reshape([x.shape[0], x.shape[1], x.shape[2] * x.shape[3]])
+
+ wx = self.w(x)
+
+ return wx
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py
new file mode 100644
index 000000000..aeae11c0b
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/signal_processing.py
@@ -0,0 +1,256 @@
+"""
+Low level signal processing utilities
+Authors
+ * Peter Plantinga 2020
+ * Francois Grondin 2020
+ * William Aris 2020
+ * Samuele Cornell 2020
+ * Sarthak Yadav 2022
+"""
+import paddle
+import math
+from packaging import version
+import numpy as np
+
+def blackman_window(window_length, periodic=True):
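+    # Minimal Blackman window, used as a stand-in for torch.blackman_window in this port:
+    # w[n] = 0.42 - 0.5 * cos(2*pi*n / (N-1)) + 0.08 * cos(4*pi*n / (N-1))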
+ if window_length == 0:
+ return []
+ if window_length == 1:
+ return paddle.ones([1])
+ if periodic:
+ window_length += 1
+
+ window = paddle.arange(window_length) * (np.pi / (window_length - 1))
+ window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42
+ return window[:-1] if periodic else window
+
+
+def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
+ """Compute amplitude of a batch of waveforms.
+ Arguments
+ ---------
+ waveform : tensor
+ The waveforms used for computing amplitude.
+ Shape should be `[time]` or `[batch, time]` or
+ `[batch, time, channels]`.
+ lengths : tensor
+ The lengths of the waveforms excluding the padding.
+ Shape should be a single dimension, `[batch]`.
+ amp_type : str
+ Whether to compute "avg" average or "peak" amplitude.
+ Choose between ["avg", "peak"].
+ scale : str
+ Whether to compute amplitude in "dB" or "linear" scale.
+ Choose between ["linear", "dB"].
+ Returns
+ -------
+ The average amplitude of the waveforms.
+ Example
+ -------
+ >>> signal = torch.sin(torch.arange(16000.0)).unsqueeze(0)
+ >>> compute_amplitude(signal, signal.size(1))
+ tensor([[0.6366]])
+ """
+ if len(waveforms.shape) == 1:
+ waveforms = waveforms.unsqueeze(0)
+
+ assert amp_type in ["avg", "peak"]
+ assert scale in ["linear", "dB"]
+
+ if amp_type == "avg":
+ if lengths is None:
+ out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
+ else:
+ wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
+ out = wav_sum / lengths
+ elif amp_type == "peak":
+ out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)[0]
+ else:
+ raise NotImplementedError
+
+ if scale == "linear":
+ return out
+ elif scale == "dB":
+ return paddle.clip(20 * paddle.log10(out), min=-80) # clamp zeros
+ else:
+ raise NotImplementedError
+
+
+def convolve1d(
+ waveform,
+ kernel,
+ padding=0,
+ pad_type="constant",
+ stride=1,
+ groups=1,
+ use_fft=False,
+ rotation_index=0,
+):
+    """Use paddle.nn.functional to perform 1d padding and conv.
+ Arguments
+ ---------
+ waveform : tensor
+ The tensor to perform operations on.
+ kernel : tensor
+ The filter to apply during convolution.
+ padding : int or tuple
+ The padding (pad_left, pad_right) to apply.
+ If an integer is passed instead, this is passed
+ to the conv1d function and pad_type is ignored.
+    pad_type : str
+        The type of padding to use. Passed directly to
+        `paddle.nn.functional.pad`, see the PaddlePaddle documentation
+        for available options.
+ stride : int
+ The number of units to move each time convolution is applied.
+ Passed to conv1d. Has no effect if `use_fft` is True.
+ groups : int
+ This option is passed to `conv1d` to split the input into groups for
+ convolution. Input channels should be divisible by the number of groups.
+ use_fft : bool
+ When `use_fft` is passed `True`, then compute the convolution in the
+ spectral domain using complex multiply. This is more efficient on CPU
+ when the size of the kernel is large (e.g. reverberation). WARNING:
+ Without padding, circular convolution occurs. This makes little
+ difference in the case of reverberation, but may make more difference
+ with different kernels.
+ rotation_index : int
+ This option only applies if `use_fft` is true. If so, the kernel is
+ rolled by this amount before convolution to shift the output location.
+ Returns
+ -------
+ The convolved waveform.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> signal = signal.unsqueeze(0).unsqueeze(2)
+ >>> kernel = torch.rand(1, 10, 1)
+ >>> signal = convolve1d(signal, kernel, padding=(9, 0))
+ """
+ if len(waveform.shape) != 3:
+ raise ValueError("Convolve1D expects a 3-dimensional tensor")
+
+ # Move time dimension last, which pad and fft and conv expect.
+ waveform = waveform.transpose([0, 2, 1])
+ kernel = kernel.transpose([0, 2, 1])
+
+ # Padding can be a tuple (left_pad, right_pad) or an int
+ if isinstance(padding, tuple):
+ waveform = paddle.nn.functional.pad(
+ x=waveform, pad=padding, mode=pad_type,
+ )
+
+ # This approach uses FFT, which is more efficient if the kernel is large
+ if use_fft:
+
+ # Pad kernel to same length as signal, ensuring correct alignment
+ zero_length = waveform.shape[-1] - kernel.shape[-1]
+
+ # Handle case where signal is shorter
+ if zero_length < 0:
+ kernel = kernel[..., :zero_length]
+ zero_length = 0
+
+ # Perform rotation to ensure alignment
+        zeros = paddle.zeros(
+            [kernel.shape[0], kernel.shape[1], zero_length]
+        )
+ after_index = kernel[..., rotation_index:]
+ before_index = kernel[..., :rotation_index]
+ kernel = paddle.concat((after_index, zeros, before_index), axis=-1)
+
+ # Multiply in frequency domain to convolve in time domain
+ # if version.parse(torch.__version__) > version.parse("1.6.0"):
+ import paddle.fft as fft
+
+ result = fft.rfft(waveform) * fft.rfft(kernel)
+ convolved = fft.irfft(result, n=waveform.shape[-1])
+
+    # Use paddle's native conv1d, which should be efficient on GPU
+ else:
+ convolved = paddle.nn.functional.conv1d(
+ x=waveform,
+ weight=kernel,
+ stride=stride,
+ groups=groups,
+ padding=padding if not isinstance(padding, tuple) else 0,
+ )
+
+ # Return time dimension to the second dimension.
+ return convolved.transpose([0, 2, 1])
+
+def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
+ """Returns a notch filter constructed from a high-pass and low-pass filter.
+ (from https://tomroelandts.com/articles/
+ how-to-create-simple-band-pass-and-band-reject-filters)
+ Arguments
+ ---------
+ notch_freq : float
+ frequency to put notch as a fraction of the
+ sampling rate / 2. The range of possible inputs is 0 to 1.
+ filter_width : int
+ Filter width in samples. Longer filters have
+ smaller transition bands, but are more inefficient.
+ notch_width : float
+ Width of the notch, as a fraction of the sampling_rate / 2.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> signal = signal.unsqueeze(0).unsqueeze(2)
+ >>> kernel = notch_filter(0.25)
+ >>> notched_signal = convolve1d(signal, kernel)
+ """
+
+ # Check inputs
+ assert 0 < notch_freq <= 1
+ assert filter_width % 2 != 0
+ pad = filter_width // 2
+ inputs = paddle.arange(filter_width) - pad
+
+ # Avoid frequencies that are too low
+ notch_freq += notch_width
+
+ # Define sinc function, avoiding division by zero
+ def sinc(x):
+ "Computes the sinc function."
+
+ def _sinc(x):
+ return paddle.sin(x) / x
+
+ # The zero is at the middle index
+ return paddle.concat([_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1 :])])
+
+ # Compute a low-pass filter with cutoff frequency notch_freq.
+ hlpf = sinc(3 * (notch_freq - notch_width) * inputs)
+ hlpf *= blackman_window(filter_width)
+ hlpf /= paddle.sum(hlpf)
+
+ # Compute a high-pass filter with cutoff frequency notch_freq.
+ hhpf = sinc(3 * (notch_freq + notch_width) * inputs)
+ hhpf *= blackman_window(filter_width)
+ hhpf /= -paddle.sum(hhpf)
+ hhpf[pad] += 1
+
+ # Adding filters creates notch filter
+    return (hlpf + hhpf).reshape([1, -1, 1])
+
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py
new file mode 100644
index 000000000..1cbbe11af
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/speech_augmentation.py
@@ -0,0 +1,741 @@
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlespeech.s2t.models.wav2vec2.speechbrain.processing.signal_processing import (
+ compute_amplitude,
+ convolve1d,
+ notch_filter)
+class SpeedPerturb(nn.Layer):
+ """Slightly speed up or slow down an audio signal.
+ Resample the audio signal at a rate that is similar to the original rate,
+ to achieve a slightly slower or slightly faster signal. This technique is
+ outlined in the paper: "Audio Augmentation for Speech Recognition"
+ Arguments
+ ---------
+ orig_freq : int
+ The frequency of the original signal.
+ speeds : list
+ The speeds that the signal should be changed to, as a percentage of the
+ original signal (i.e. `speeds` is divided by 100 to get a ratio).
+ perturb_prob : float
+ The chance that the batch will be speed-
+ perturbed. By default, every batch is perturbed.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> perturbator = SpeedPerturb(orig_freq=16000, speeds=[90])
+ >>> clean = signal.unsqueeze(0)
+ >>> perturbed = perturbator(clean)
+ >>> clean.shape
+ torch.Size([1, 52173])
+ >>> perturbed.shape
+ torch.Size([1, 46956])
+ """
+
+ def __init__(
+ self, orig_freq, speeds=[90, 100, 110], perturb_prob=1.0,
+ ):
+ super().__init__()
+ self.orig_freq = orig_freq
+ self.speeds = speeds
+ self.perturb_prob = perturb_prob
+
+ # Initialize index of perturbation
+ self.samp_index = 0
+
+ # Initialize resamplers
+ self.resamplers = []
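+        # Build one resampler per speed: changing speed by x% is implemented as
+        # resampling from orig_freq to orig_freq * x / 100.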
+ for speed in self.speeds:
+ config = {
+ "orig_freq": self.orig_freq,
+ "new_freq": self.orig_freq * speed // 100,
+ }
+ self.resamplers.append(Resample(**config))
+
+ def forward(self, waveform):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ lengths : tensor
+ Shape should be a single dimension, `[batch]`.
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or `[batch, time, channels]`.
+ """
+
+ # Don't perturb (return early) 1-`perturb_prob` portion of the batches
+        if paddle.rand([1]) > self.perturb_prob:
+            return waveform.clone()
+
+        # Perform a random perturbation
+ self.samp_index = paddle.randint(len(self.speeds), shape=(1,))[0]
+ perturbed_waveform = self.resamplers[self.samp_index](waveform)
+
+ return perturbed_waveform
+
+class Resample(nn.Layer):
+ """This class resamples an audio signal using sinc-based interpolation.
+
+ It is a modification of the `resample` function from torchaudio
+ (https://pytorch.org/audio/stable/tutorials/audio_resampling_tutorial.html)
+
+ Arguments
+ ---------
+ orig_freq : int
+ the sampling frequency of the input signal.
+ new_freq : int
+ the new sampling frequency after this operation is performed.
+ lowpass_filter_width : int
+ Controls the sharpness of the filter, larger numbers result in a
+ sharper filter, but they are less efficient. Values from 4 to 10 are allowed.
+
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> signal = signal.unsqueeze(0) # [batch, time, channels]
+ >>> resampler = Resample(orig_freq=16000, new_freq=8000)
+ >>> resampled = resampler(signal)
+ >>> signal.shape
+ torch.Size([1, 52173])
+ >>> resampled.shape
+ torch.Size([1, 26087])
+ """
+
+ def __init__(
+ self, orig_freq=16000, new_freq=16000, lowpass_filter_width=6,
+ ):
+ super().__init__()
+ self.orig_freq = orig_freq
+ self.new_freq = new_freq
+ self.lowpass_filter_width = lowpass_filter_width
+
+ # Compute rate for striding
+ self._compute_strides()
+ assert self.orig_freq % self.conv_stride == 0
+ assert self.new_freq % self.conv_transpose_stride == 0
+
+ def _compute_strides(self):
+ """Compute the phases in polyphase filter.
+
+ (almost directly from torchaudio.compliance.kaldi)
+ """
+
+ # Compute new unit based on ratio of in/out frequencies
+ base_freq = math.gcd(self.orig_freq, self.new_freq)
+ input_samples_in_unit = self.orig_freq // base_freq
+ self.output_samples = self.new_freq // base_freq
+
+ # Store the appropriate stride based on the new units
+ self.conv_stride = input_samples_in_unit
+ self.conv_transpose_stride = self.output_samples
+
+ def forward(self, waveforms):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ lengths : tensor
+ Shape should be a single dimension, `[batch]`.
+
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or `[batch, time, channels]`.
+ """
+
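+        # Lazily build the sinc interpolation indices/weights on the first call.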
+ if not hasattr(self, "first_indices"):
+ self._indices_and_weights(waveforms)
+
+ # Don't do anything if the frequencies are the same
+ if self.orig_freq == self.new_freq:
+ return waveforms
+ unsqueezed = False
+ if len(waveforms.shape) == 2:
+ waveforms = waveforms.unsqueeze(1)
+ unsqueezed = True
+ elif len(waveforms.shape) == 3:
+ waveforms = waveforms.transpose([0, 2, 1])
+ else:
+ raise ValueError("Input must be 2 or 3 dimensions")
+
+ # Do resampling
+ resampled_waveform = self._perform_resample(waveforms)
+
+ if unsqueezed:
+ resampled_waveform = resampled_waveform.squeeze(1)
+ else:
+ resampled_waveform = resampled_waveform.transpose([0, 2, 1])
+
+ return resampled_waveform
+
+ def _perform_resample(self, waveforms):
+ """Resamples the waveform at the new frequency.
+
+ This matches Kaldi's OfflineFeatureTpl ResampleWaveform which uses a
+ LinearResample (resample a signal at linearly spaced intervals to
+ up/downsample a signal). LinearResample (LR) means that the output
+ signal is at linearly spaced intervals (i.e the output signal has a
+ frequency of `new_freq`). It uses sinc/bandlimited interpolation to
+ upsample/downsample the signal.
+
+ (almost directly from torchaudio.compliance.kaldi)
+
+ https://ccrma.stanford.edu/~jos/resample/
+ Theory_Ideal_Bandlimited_Interpolation.html
+
+ https://github.com/kaldi-asr/kaldi/blob/master/src/feat/resample.h#L56
+
+ Arguments
+ ---------
+ waveforms : tensor
+ The batch of audio signals to resample.
+
+ Returns
+ -------
+ The waveforms at the new frequency.
+ """
+
+ # Compute output size and initialize
+ batch_size, num_channels, wave_len = waveforms.shape
+ window_size = self.weights.shape[1]
+ tot_output_samp = self._output_samples(wave_len)
+ resampled_waveform = paddle.zeros(
+ (batch_size, num_channels, tot_output_samp)
+ )
+ # self.weights = self.weights.to(waveforms.device)
+
+ # Check weights are on correct device
+ # if waveforms.device != self.weights.device:
+ # self.weights = self.weights.to(waveforms.device)
+
+ # eye size: (num_channels, num_channels, 1)
+ eye = paddle.eye(num_channels).unsqueeze(2)
+
+ # Iterate over the phases in the polyphase filter
+ for i in range(self.first_indices.shape[0]):
+ wave_to_conv = waveforms
+ first_index = int(self.first_indices[i].item())
+ if first_index >= 0:
+ # trim the signal as the filter will not be applied
+ # before the first_index
+ wave_to_conv = wave_to_conv[..., first_index:]
+
+ # pad the right of the signal to allow partial convolutions
+ # meaning compute values for partial windows (e.g. end of the
+ # window is outside the signal length)
+ max_index = (tot_output_samp - 1) // self.output_samples
+ end_index = max_index * self.conv_stride + window_size
+ current_wave_len = wave_len - first_index
+ right_padding = max(0, end_index + 1 - current_wave_len)
+ left_padding = max(0, -first_index)
+ wave_to_conv = paddle.nn.functional.pad(
+ wave_to_conv, (left_padding, right_padding), data_format='NCL'
+ )
+ conv_wave = paddle.nn.functional.conv1d(
+ x=wave_to_conv,
+                weight=self.weights[i].tile([num_channels, 1, 1]),
+ stride=self.conv_stride,
+ groups=num_channels,
+ )
+
+ # we want conv_wave[:, i] to be at
+ # output[:, i + n*conv_transpose_stride]
+ dilated_conv_wave = paddle.nn.functional.conv1d_transpose(
+ conv_wave, eye, stride=self.conv_transpose_stride
+ )
+
+ # pad dilated_conv_wave so it reaches the output length if needed.
+ left_padding = i
+ previous_padding = left_padding + dilated_conv_wave.shape[-1]
+ right_padding = max(0, tot_output_samp - previous_padding)
+ dilated_conv_wave = paddle.nn.functional.pad(
+ dilated_conv_wave, (left_padding, right_padding), data_format='NCL'
+ )
+ dilated_conv_wave = dilated_conv_wave[..., :tot_output_samp]
+
+ resampled_waveform += dilated_conv_wave
+
+ return resampled_waveform
+
+ def _output_samples(self, input_num_samp):
+ """Based on LinearResample::GetNumOutputSamples.
+
+ LinearResample (LR) means that the output signal is at
+ linearly spaced intervals (i.e the output signal has a
+ frequency of ``new_freq``). It uses sinc/bandlimited
+ interpolation to upsample/downsample the signal.
+
+ (almost directly from torchaudio.compliance.kaldi)
+
+ Arguments
+ ---------
+ input_num_samp : int
+ The number of samples in each example in the batch.
+
+ Returns
+ -------
+ Number of samples in the output waveform.
+ """
+
+ # For exact computation, we measure time in "ticks" of 1.0 / tick_freq,
+ # where tick_freq is the least common multiple of samp_in and
+ # samp_out.
+ samp_in = int(self.orig_freq)
+ samp_out = int(self.new_freq)
+
+ tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out)
+ ticks_per_input_period = tick_freq // samp_in
+
+ # work out the number of ticks in the time interval
+ # [ 0, input_num_samp/samp_in ).
+ interval_length = input_num_samp * ticks_per_input_period
+ if interval_length <= 0:
+ return 0
+ ticks_per_output_period = tick_freq // samp_out
+
+ # Get the last output-sample in the closed interval,
+ # i.e. replacing [ ) with [ ]. Note: integer division rounds down.
+ # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an
+ # explanation of the notation.
+ last_output_samp = interval_length // ticks_per_output_period
+
+ # We need the last output-sample in the open interval, so if it
+ # takes us to the end of the interval exactly, subtract one.
+ if last_output_samp * ticks_per_output_period == interval_length:
+ last_output_samp -= 1
+
+ # First output-sample index is zero, so the number of output samples
+ # is the last output-sample plus one.
+ num_output_samp = last_output_samp + 1
+
+ return num_output_samp
+
+ def _indices_and_weights(self, waveforms):
+ """Based on LinearResample::SetIndexesAndWeights
+
+ Retrieves the weights for resampling as well as the indices in which
+ they are valid. LinearResample (LR) means that the output signal is at
+ linearly spaced intervals (i.e the output signal has a frequency
+ of ``new_freq``). It uses sinc/bandlimited interpolation to
+ upsample/downsample the signal.
+
+ Returns
+ -------
+ - the place where each filter should start being applied
+ - the filters to be applied to the signal for resampling
+ """
+
+ # Lowpass filter frequency depends on smaller of two frequencies
+ min_freq = min(self.orig_freq, self.new_freq)
+ lowpass_cutoff = 0.99 * 0.5 * min_freq
+
+ assert lowpass_cutoff * 2 <= min_freq
+ window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
+
+ assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
+ output_t = paddle.arange(
+ start=0.0, end=self.output_samples
+ )
+ output_t /= self.new_freq
+ min_t = output_t - window_width
+ max_t = output_t + window_width
+
+ min_input_index = paddle.ceil(min_t * self.orig_freq)
+ max_input_index = paddle.floor(max_t * self.orig_freq)
+ num_indices = max_input_index - min_input_index + 1
+
+ max_weight_width = num_indices.max()
+ j = paddle.arange(max_weight_width)
+ input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0)
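+        # Time offset (in seconds) between each output sample and the candidate
+        # input samples inside its interpolation window.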
+ delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1)
+
+ weights = paddle.zeros_like(delta_t)
+
+ inside_window_indices = delta_t.abs() < (window_width)
+ # raised-cosine (Hanning) window with width `window_width`
+ weights[inside_window_indices] = 0.5 * (
+ 1
+ + paddle.cos(
+ 2
+ * math.pi
+ * lowpass_cutoff
+ / self.lowpass_filter_width
+ * delta_t[inside_window_indices]
+ )
+ )
+ t_eq_zero_indices = delta_t == 0.0
+ t_not_eq_zero_indices = ~t_eq_zero_indices
+
+ # sinc filter function
+ weights[t_not_eq_zero_indices] *= paddle.sin(
+ 2 * math.pi * lowpass_cutoff * delta_t[t_not_eq_zero_indices]
+ ) / (math.pi * delta_t[t_not_eq_zero_indices])
+
+ # limit of the function at t = 0
+ weights[t_eq_zero_indices] *= 2 * lowpass_cutoff
+
+ # size (output_samples, max_weight_width)
+ weights /= self.orig_freq
+
+ self.first_indices = min_input_index
+ self.weights = weights
+
+
+class DropFreq(nn.Layer):
+ """This class drops a random frequency from the signal.
+ The purpose of this class is to teach models to learn to rely on all parts
+ of the signal, not just a few frequency bands.
+ Arguments
+ ---------
+ drop_freq_low : float
+ The low end of frequencies that can be dropped,
+ as a fraction of the sampling rate / 2.
+ drop_freq_high : float
+ The high end of frequencies that can be
+ dropped, as a fraction of the sampling rate / 2.
+ drop_count_low : int
+ The low end of number of frequencies that could be dropped.
+ drop_count_high : int
+ The high end of number of frequencies that could be dropped.
+ drop_width : float
+ The width of the frequency band to drop, as
+ a fraction of the sampling_rate / 2.
+ drop_prob : float
+ The probability that the batch of signals will have a frequency
+ dropped. By default, every batch has frequencies dropped.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> dropper = DropFreq()
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> dropped_signal = dropper(signal.unsqueeze(0))
+ """
+
+ def __init__(
+ self,
+ drop_freq_low=1e-14,
+ drop_freq_high=1,
+ drop_count_low=1,
+ drop_count_high=2,
+ drop_width=0.05,
+ drop_prob=1,
+ ):
+ super().__init__()
+ self.drop_freq_low = drop_freq_low
+ self.drop_freq_high = drop_freq_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_width = drop_width
+ self.drop_prob = drop_prob
+
+ def forward(self, waveforms):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or `[batch, time, channels]`.
+ """
+
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+ dropped_waveform = waveforms.clone()
+ if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Add channels dimension
+ if len(waveforms.shape) == 2:
+ dropped_waveform = dropped_waveform.unsqueeze(-1)
+
+ # Pick number of frequencies to drop
+ drop_count = paddle.randint(
+ low=self.drop_count_low, high=self.drop_count_high + 1, shape=(1,),
+ )
+
+ # Pick a frequency to drop
+ drop_range = self.drop_freq_high - self.drop_freq_low
+ drop_frequency = (
+ paddle.rand(drop_count) * drop_range + self.drop_freq_low
+ )
+
+ # Filter parameters
+ filter_length = 101
+ pad = filter_length // 2
+
+ # Start with delta function
+ drop_filter = paddle.zeros([1, filter_length, 1])
+ drop_filter[0, pad, 0] = 1
+ # Subtract each frequency
+ for frequency in drop_frequency:
+ notch_kernel = notch_filter(
+ frequency, filter_length, self.drop_width,
+ )
+ drop_filter = convolve1d(drop_filter, notch_kernel, pad)
+
+ # Apply filter
+ dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
+
+ # Remove channels dimension if added
+ return dropped_waveform.squeeze(-1)
+
+class DropChunk(nn.Layer):
+ """This class drops portions of the input signal.
+    Using `DropChunk` as an augmentation strategy helps models learn to rely
+ on all parts of the signal, since it can't expect a given part to be
+ present.
+ Arguments
+ ---------
+ drop_length_low : int
+ The low end of lengths for which to set the
+ signal to zero, in samples.
+ drop_length_high : int
+ The high end of lengths for which to set the
+ signal to zero, in samples.
+ drop_count_low : int
+ The low end of number of times that the signal
+ can be dropped to zero.
+ drop_count_high : int
+ The high end of number of times that the signal
+ can be dropped to zero.
+ drop_start : int
+ The first index for which dropping will be allowed.
+ drop_end : int
+ The last index for which dropping will be allowed.
+ drop_prob : float
+ The probability that the batch of signals will
+ have a portion dropped. By default, every batch
+ has portions dropped.
+ noise_factor : float
+ The factor relative to average amplitude of an utterance
+ to use for scaling the white noise inserted. 1 keeps
+ the average amplitude the same, while 0 inserts all 0's.
+ Example
+ -------
+ >>> from speechbrain.dataio.dataio import read_audio
+ >>> dropper = DropChunk(drop_start=100, drop_end=200, noise_factor=0.)
+ >>> signal = read_audio('tests/samples/single-mic/example1.wav')
+ >>> signal = signal.unsqueeze(0) # [batch, time, channels]
+ >>> length = torch.ones(1)
+ >>> dropped_signal = dropper(signal, length)
+ >>> float(dropped_signal[:, 150])
+ 0.0
+ """
+
+ def __init__(
+ self,
+ drop_length_low=100,
+ drop_length_high=1000,
+ drop_count_low=1,
+ drop_count_high=10,
+ drop_start=0,
+ drop_end=None,
+ drop_prob=1,
+ noise_factor=0.0,
+ ):
+ super().__init__()
+ self.drop_length_low = drop_length_low
+ self.drop_length_high = drop_length_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_start = drop_start
+ self.drop_end = drop_end
+ self.drop_prob = drop_prob
+ self.noise_factor = noise_factor
+
+ # Validate low < high
+ if drop_length_low > drop_length_high:
+ raise ValueError("Low limit must not be more than high limit")
+ if drop_count_low > drop_count_high:
+ raise ValueError("Low limit must not be more than high limit")
+
+ # Make sure the length doesn't exceed end - start
+ if drop_end is not None and drop_end >= 0:
+ if drop_start > drop_end:
+ raise ValueError("Low limit must not be more than high limit")
+
+ drop_range = drop_end - drop_start
+ self.drop_length_low = min(drop_length_low, drop_range)
+ self.drop_length_high = min(drop_length_high, drop_range)
+
+ def forward(self, waveforms, lengths):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ lengths : tensor
+ Shape should be a single dimension, `[batch]`.
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or
+ `[batch, time, channels]`
+ """
+
+ # Reading input list
+        lengths = (lengths * waveforms.shape[1]).astype(paddle.int64)
+ batch_size = waveforms.shape[0]
+ dropped_waveform = waveforms.clone()
+
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+ if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Store original amplitude for computing white noise amplitude
+ clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
+
+ # Pick a number of times to drop
+ drop_times = paddle.randint(
+ low=self.drop_count_low,
+ high=self.drop_count_high + 1,
+ shape=(batch_size,),
+ )
+
+ # Iterate batch to set mask
+ for i in range(batch_size):
+ if drop_times[i] == 0:
+ continue
+
+ # Pick lengths
+ length = paddle.randint(
+ low=self.drop_length_low,
+ high=self.drop_length_high + 1,
+ shape=(drop_times[i],),
+ )
+
+ # Compute range of starting locations
+ start_min = self.drop_start
+ if start_min < 0:
+ start_min += lengths[i]
+ start_max = self.drop_end
+ if start_max is None:
+ start_max = lengths[i]
+ if start_max < 0:
+ start_max += lengths[i]
+ start_max = max(0, start_max - length.max())
+
+ # Pick starting locations
+ start = paddle.randint(
+ low=start_min, high=start_max + 1, shape=(drop_times[i],),
+ )
+
+ end = start + length
+
+ # Update waveform
+ if not self.noise_factor:
+ for j in range(drop_times[i]):
+ dropped_waveform[i, start[j] : end[j]] = 0.0
+ else:
+ # Uniform distribution of -2 to +2 * avg amplitude should
+ # preserve the average for normalization
+ noise_max = 2 * clean_amplitude[i] * self.noise_factor
+ for j in range(drop_times[i]):
+ # zero-center the noise distribution
+ noise_vec = paddle.rand(length[j])
+ noise_vec = 2 * noise_max * noise_vec - noise_max
+ dropped_waveform[i, start[j] : end[j]] = noise_vec
+
+ return dropped_waveform
+
+
+class TimeDomainSpecAugment(nn.Layer):
+ """A time-domain approximation of the SpecAugment algorithm.
+
+ This augmentation module implements three augmentations in
+ the time-domain.
+
+ 1. Drop chunks of the audio (zero amplitude or white noise)
+ 2. Drop frequency bands (with band-drop filters)
+    3. Speed perturbation (via resampling to slightly different rate)
+
+ Arguments
+ ---------
+ perturb_prob : float from 0 to 1
+ The probability that a batch will have speed perturbation applied.
+ drop_freq_prob : float from 0 to 1
+ The probability that a batch will have frequencies dropped.
+ drop_chunk_prob : float from 0 to 1
+ The probability that a batch will have chunks dropped.
+ speeds : list of ints
+ A set of different speeds to use to perturb each batch.
+ See ``speechbrain.processing.speech_augmentation.SpeedPerturb``
+ sample_rate : int
+ Sampling rate of the input waveforms.
+ drop_freq_count_low : int
+ Lowest number of frequencies that could be dropped.
+ drop_freq_count_high : int
+ Highest number of frequencies that could be dropped.
+ drop_chunk_count_low : int
+ Lowest number of chunks that could be dropped.
+ drop_chunk_count_high : int
+ Highest number of chunks that could be dropped.
+ drop_chunk_length_low : int
+ Lowest length of chunks that could be dropped.
+ drop_chunk_length_high : int
+ Highest length of chunks that could be dropped.
+ drop_chunk_noise_factor : float
+ The noise factor used to scale the white noise inserted, relative to
+ the average amplitude of the utterance. Default 0 (no noise inserted).
+
+ Example
+ -------
+    >>> inputs = paddle.randn([10, 16000])
+    >>> feature_maker = TimeDomainSpecAugment(speeds=[80])
+    >>> feats = feature_maker(inputs, paddle.ones([10]))
+    >>> feats.shape
+    [10, 12800]
+ """
+
+ def __init__(
+ self,
+ perturb_prob=1.0,
+ drop_freq_prob=1.0,
+ drop_chunk_prob=1.0,
+ speeds=[95, 100, 105],
+ sample_rate=16000,
+ drop_freq_count_low=0,
+ drop_freq_count_high=3,
+ drop_chunk_count_low=0,
+ drop_chunk_count_high=5,
+ drop_chunk_length_low=1000,
+ drop_chunk_length_high=2000,
+ drop_chunk_noise_factor=0,
+ ):
+ super().__init__()
+ self.speed_perturb = SpeedPerturb(
+ perturb_prob=perturb_prob, orig_freq=sample_rate, speeds=speeds
+ )
+ self.drop_freq = DropFreq(
+ drop_prob=drop_freq_prob,
+ drop_count_low=drop_freq_count_low,
+ drop_count_high=drop_freq_count_high,
+ )
+ self.drop_chunk = DropChunk(
+ drop_prob=drop_chunk_prob,
+ drop_count_low=drop_chunk_count_low,
+ drop_count_high=drop_chunk_count_high,
+ drop_length_low=drop_chunk_length_low,
+ drop_length_high=drop_chunk_length_high,
+ noise_factor=drop_chunk_noise_factor,
+ )
+
+ def forward(self, waveforms, lengths):
+ """Returns the distorted waveforms.
+
+ Arguments
+ ---------
+        waveforms : paddle.Tensor
+ The waveforms to distort
+ """
+ # Augmentation
+ with paddle.no_grad():
+ waveforms = self.speed_perturb(waveforms)
+ waveforms = self.drop_freq(waveforms)
+ waveforms = self.drop_chunk(waveforms, lengths)
+ return waveforms
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py
new file mode 100644
index 000000000..da243342c
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/speechbrain/processing/test.py
@@ -0,0 +1,14 @@
+import paddle
+import numpy as np
+
+def blackman_window(window_length, periodic=True):
+ if window_length == 0:
+ return []
+ if window_length == 1:
+ return paddle.ones([1])
+ if periodic:
+ window_length += 1
+
+ window = paddle.arange(window_length) * (np.pi / (window_length - 1))
+ window = 0.08 * paddle.cos(window * 4) - 0.5 * paddle.cos(window * 2) + 0.42
+ return window[:-1] if periodic else window
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
new file mode 100644
index 000000000..e20a7e129
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -0,0 +1,287 @@
+import os
+import sys
+
+import numpy as np
+
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2ConfigPure
+from paddlespeech.s2t.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
+from paddlespeech.s2t.modules.mask import make_pad_mask
+from paddlespeech.s2t.utils.utility import log_add
+
+from collections import defaultdict
+
+from paddlespeech.s2t.models.wav2vec2.speechbrain.lobes.models.VanillaNN import VanillaNN
+from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
+from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+class Wav2vec2ASR(nn.Layer):
+ def __init__(self, config: dict):
+ super().__init__()
+
+ wav2vec2_config = Wav2Vec2ConfigPure()
+ wav2vec2 = Wav2Vec2Model(wav2vec2_config)
+
+ model_dict = paddle.load(config.wav2vec2_params_path)
+ wav2vec2.set_state_dict(model_dict)
+ wav2vec2.eval()
+ self.normalize_wav = config.normalize_wav
+ self.output_norm = config.output_norm
+ if config.freeze_wav2vec2:
+ for parm in wav2vec2.parameters():
+ parm.trainable = False
+ self.wav2vec2 = wav2vec2
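+        # ASR head: a small feed-forward encoder on top of the (optionally frozen)
+        # wav2vec2 features, followed by a CTC output layer.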
+        self.enc = VanillaNN(
+            input_shape=[None, None, wav2vec2_config.hidden_size],
+            activation=nn.LeakyReLU,
+            dnn_blocks=config.dnn_blocks,
+            dnn_neurons=config.dnn_neurons)
+        self.ctc = CTC(
+            odim=config.output_dim,
+            enc_n_units=config.dnn_neurons,
+            blank_id=config.blank_id,
+            dropout_rate=config.ctc_dropout_rate,
+            reduction_type="mean")
+
+ def train_batch(self):
+ wav, wavs_lens_rate, target, target_lens_rate = self._get_data()
+ ctc_loss = self(wav, wavs_lens_rate, target, target_lens_rate)
+
+
+ def forward(self, wav, wavs_lens_rate, target, target_lens_rate):
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+ # Extract wav2vec output
+ out = self.wav2vec2(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+
+ x = self.enc(feats)
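+        # wavs_lens_rate / target_lens_rate are relative lengths in [0, 1];
+        # convert them to absolute frame / token counts for the CTC loss.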
+ x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
+ target_lens = (target_lens_rate * target.shape[1]).round().astype(paddle.int64)
+
+ ctc_loss = self.ctc(x, x_lens, target, target_lens)
+ return ctc_loss
+
+
+ @paddle.no_grad()
+ def decode(self,
+ feats: paddle.Tensor,
+ feats_lengths: paddle.Tensor,
+ text_feature: Dict[str, int],
+ decoding_method: str,
+ beam_size: int):
+ batch_size = feats.shape[0]
+        if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
+ logger.error(
+ f'decoding mode {decoding_method} must be running with batch_size == 1'
+ )
+ logger.error(f"current batch_size is {batch_size}")
+ sys.exit(1)
+
+ if decoding_method == 'ctc_greedy_search':
+ hyps = self.ctc_greedy_search(feats, feats_lengths)
+ res = [text_feature.defeaturize(hyp) for hyp in hyps]
+ res_tokenids = [hyp for hyp in hyps]
+ # ctc_prefix_beam_search and attention_rescoring only return one
+ # result in List[int], change it to List[List[int]] for compatible
+ # with other batch decoding mode
+ elif decoding_method == 'ctc_prefix_beam_search':
+ assert feats.shape[0] == 1
+ hyp = self.ctc_prefix_beam_search(
+ feats,
+ feats_lengths,
+ beam_size)
+ res = [text_feature.defeaturize(hyp)]
+ res_tokenids = [hyp]
+ else:
+            raise ValueError(f"wav2vec2 does not support decoding method: {decoding_method}")
+
+ return res, res_tokenids
+
+ @classmethod
+ def from_config(cls, config):
+ model = cls(config)
+ return model
+
+ def ctc_greedy_search(
+ self, wav, wavs_lens_rate) -> List[List[int]]:
+ """ Apply CTC greedy search
+ Args:
+ speech (paddle.Tensor): (batch, max_len)
+ speech_length (paddle.Tensor): (batch, )
+ Returns:
+ List[List[int]]: best path result
+ """
+ batch_size = wav.shape[0]
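+        # Drop the trailing channel dimension: (batch, time, 1) -> (batch, time).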
+ wav = wav[:,:,0]
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+ # Extract wav2vec output
+ out = self.wav2vec2(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+ x = self.enc(feats)
+ x_lens = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
+ topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
+ topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
+ # pad_mask = make_pad_mask(x_lens) # (B, maxlen)
+ # topk_index = topk_index.masked_fill_(pad_mask, self.eos) # (B, maxlen)
+
+ hyps = [hyp.tolist() for hyp in topk_index]
+ hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
+ return hyps
+
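+    # For illustration: ctc_greedy_search above takes the frame-wise argmax token
+    # and then collapses repeats and blanks (assuming the blank id is 0), e.g.
+    #   frame argmax [0, 3, 3, 0, 0, 5, 5] --remove_duplicates_and_blank--> [3, 5]
+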
+    def _ctc_prefix_beam_search(
+            self, wav, wavs_lens_rate, beam_size,
+            blank_id: int=0) -> List[Tuple[Tuple[int, ...], float]]:
+        """ CTC prefix beam search inner implementation
+        Args:
+            wav (paddle.Tensor): (batch, max_len, 1), batch size must be 1
+            wavs_lens_rate (paddle.Tensor): (batch, )
+            beam_size (int): beam size for beam search
+            blank_id (int): id of the blank symbol
+        Returns:
+            List[Tuple[Tuple[int, ...], float]]: nbest results, each item is
+                (token id sequence, log likelihood)
+        """
+ wav = wav[:,:,0]
+
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+ # Extract wav2vec output
+ out = self.wav2vec2(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+
+ x = self.enc(feats)
+ maxlen = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size)
+ ctc_probs = ctc_probs.squeeze(0)
+
+ # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
+ # blank_ending_score and none_blank_ending_score in ln domain
+ cur_hyps = [(tuple(), (0.0, -float('inf')))]
+ # 2. CTC beam search step by step
+ for t in range(0, maxlen):
+ logp = ctc_probs[t] # (vocab_size,)
+ # key: prefix, value (pb, pnb), default value(-inf, -inf)
+ next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
+ # 2.1 First beam prune: select topk best
+ top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,)
+ for s in top_k_index:
+ s = s.item()
+ ps = logp[s].item()
+ for prefix, (pb, pnb) in cur_hyps:
+ last = prefix[-1] if len(prefix) > 0 else None
+ if s == blank_id: # blank
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pb = log_add([n_pb, pb + ps, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ elif s == last:
+ # Update *ss -> *s;
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pnb = log_add([n_pnb, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ # Update *s-s -> *ss, - is for blank
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+ else:
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+
+ # 2.2 Second beam prune
+ next_hyps = sorted(
+ next_hyps.items(),
+ key=lambda x: log_add(list(x[1])),
+ reverse=True)
+ cur_hyps = next_hyps[:beam_size]
+
+ hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
+ return hyps
+
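+    # For illustration: each prefix keeps two log-domain scores, pb (prefix ends
+    # in blank) and pnb (prefix ends in a non-blank token); its total score is
+    # log_add([pb, pnb]), e.g. pb = log(0.3), pnb = log(0.2) gives log(0.5).
+    # Extending a prefix with the same token as its last one only grows the longer
+    # prefix through pb (the "*s-s -> *ss" case in the loop above).
+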
+ def ctc_prefix_beam_search(self, wav, wavs_lens_rate, beam_size) -> List[int]:
+        """ Apply CTC prefix beam search
+        Args:
+            wav (paddle.Tensor): (batch, max_len, 1), batch size must be 1
+            wavs_lens_rate (paddle.Tensor): (batch, )
+            beam_size (int): beam size for beam search
+        Returns:
+            List[int]: CTC prefix beam search best result (token ids)
+        """
+ hyps = self._ctc_prefix_beam_search(
+ wav, wavs_lens_rate, beam_size)
+ return hyps[0][0]
+
+ # @jit.to_static
+ def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
+ """ Export interface for c++ call, apply linear transform and log
+ softmax before ctc
+ Args:
+ xs (paddle.Tensor): encoder output, (B, T, D)
+ Returns:
+ paddle.Tensor: activation before ctc
+ """
+ return self.ctc.log_softmax(xs)
+
+
+ def _get_data(self):
+ data_dir = "data"
+ wavs = np.load(os.path.join(data_dir, "wavs.npy"))
+ wavs_lens = np.load(os.path.join(data_dir, "wavs_lens.npy"))
+ tokens = np.load(os.path.join(data_dir, "tokens.npy"))
+ tokens_lens = np.load(os.path.join(data_dir, "tokens_lens.npy"))
+
+ batch = (paddle.to_tensor(wavs), paddle.to_tensor(wavs_lens, dtype='float32'),
+ paddle.to_tensor(tokens, dtype='int32'), paddle.to_tensor(tokens_lens, dtype='float32'))
+ return batch
+
+
+if __name__ == "__main__":
+ # wav2vec2_asr = Wav2vec2ASR(config={})
+ # wav2vec2_asr.train_batch()
+ freeze = True
+ config = Wav2Vec2ConfigPure()
+ model = Wav2Vec2Model(config)
+ model_dict = model.state_dict()
+ revise_params_path = "exp/torch_to_paddle_revise.pdparams"
+ model_dict_revise = paddle.load(revise_params_path)
+ model.set_state_dict(model_dict_revise)
+    model.eval()
+ if freeze:
+ for parm in model.parameters():
+            parm.trainable = False
+ # get enc()
+    enc = VanillaNN(
+        input_shape=[None, None, 1024],
+        activation=paddle.nn.LeakyReLU,
+        dnn_blocks=2,
+        dnn_neurons=1024)
+
+ ctc = CTC(odim=30, enc_n_units=1024, blank_id=0, dropout_rate=0.0)
+
+ input_values = np.load("input_values.npy")
+ input_values = paddle.to_tensor(input_values)
+
+ feats = model(input_values).last_hidden_state
+ x = enc(feats)
+    # The CTC loss expects (encoder output, output lengths, targets, target lengths);
+    # the target tensors are not loaded in this standalone check, so the call is
+    # left as a sketch only:
+    # ctc_loss = ctc(x, x_lens, target, target_lens)
diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py
index 34d796145..cacda2461 100644
--- a/paddlespeech/s2t/modules/align.py
+++ b/paddlespeech/s2t/modules/align.py
@@ -11,10 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import math
-
import paddle
from paddle import nn
+import math
"""
To align the initializer between paddle and torch,
the API below are set defalut initializer with priority higger than global initializer.
@@ -82,18 +81,10 @@ class Linear(nn.Linear):
name=None):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
- weight_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
- bias_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Linear, self).__init__(in_features, out_features, weight_attr,
bias_attr, name)
@@ -113,18 +104,10 @@ class Conv1D(nn.Conv1D):
data_format='NCL'):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
- weight_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
- bias_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Conv1D, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
groups, padding_mode, weight_attr, bias_attr, data_format)
@@ -145,18 +128,10 @@ class Conv2D(nn.Conv2D):
data_format='NCHW'):
if weight_attr is None:
if global_init_type == "kaiming_uniform":
- weight_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
if bias_attr is None:
if global_init_type == "kaiming_uniform":
- bias_attr = paddle.ParamAttr(
- initializer=nn.initializer.KaimingUniform(
- fan_in=None,
- negative_slope=math.sqrt(5),
- nonlinearity='leaky_relu'))
+ bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform(fan_in=None, negative_slope=math.sqrt(5), nonlinearity='leaky_relu'))
super(Conv2D, self).__init__(
in_channels, out_channels, kernel_size, stride, padding, dilation,
groups, padding_mode, weight_attr, bias_attr, data_format)
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 92990048d..b6d615867 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -15,6 +15,7 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
+from typing import Optional
from typing import Tuple
import paddle
@@ -82,12 +83,11 @@ class MultiHeadedAttention(nn.Layer):
return q, k, v
- def forward_attention(
- self,
- value: paddle.Tensor,
+ def forward_attention(self,
+ value: paddle.Tensor,
scores: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
- ) -> paddle.Tensor:
+ mask: paddle.Tensor = paddle.ones([0, 0, 0], dtype=paddle.bool),
+ ) -> paddle.Tensor:
"""Compute attention context vector.
Args:
value (paddle.Tensor): Transformed value, size
@@ -108,7 +108,7 @@ class MultiHeadedAttention(nn.Layer):
# When will `if mask.size(2) > 0` be False?
# 1. onnx(16/-1, -1/-1, 16/0)
# 2. jit (16/-1, -1/-1, 16/0, 16/4)
- if paddle.shape(mask)[2] > 0: # time2 > 0
+ if paddle.shape(mask)[2] > 0: # time2 > 0
mask = mask.unsqueeze(1).equal(0) # (batch, 1, *, time2)
# for last chunk, time2 might be larger than scores.size(-1)
mask = mask[:, :, :, :paddle.shape(scores)[-1]]
@@ -127,15 +127,14 @@ class MultiHeadedAttention(nn.Layer):
return self.linear_out(x) # (batch, time1, d_model)
- def forward(
- self,
- query: paddle.Tensor,
- key: paddle.Tensor,
- value: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- pos_emb: paddle.Tensor, # paddle.empty([0])
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor = paddle.empty([0]),
+ cache: paddle.Tensor = paddle.zeros([0,0,0,0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute scaled dot product attention.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
@@ -244,15 +243,14 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
return x
- def forward(
- self,
- query: paddle.Tensor,
- key: paddle.Tensor,
- value: paddle.Tensor,
- mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- pos_emb: paddle.Tensor, # paddle.empty([0])
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor = paddle.empty([0]),
+ cache: paddle.Tensor = paddle.zeros([0,0,0,0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute 'Scaled Dot Product Attention' with rel. positional encoding.
Args:
query (paddle.Tensor): Query tensor (#batch, time1, size).
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index b35fea5b9..c384b9c78 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -14,6 +14,7 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""ConvolutionModule definition."""
+from typing import Optional
from typing import Tuple
import paddle
@@ -105,12 +106,11 @@ class ConvolutionModule(nn.Layer):
)
self.activation = activation
- def forward(
- self,
- x: paddle.Tensor,
- mask_pad: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
- cache: paddle.Tensor # paddle.zeros([0,0,0,0])
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self,
+ x: paddle.Tensor,
+ mask_pad: paddle.Tensor= paddle.ones([0,0,0], dtype=paddle.bool),
+ cache: paddle.Tensor= paddle.zeros([0,0,0]),
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
"""Compute convolution module.
Args:
x (paddle.Tensor): Input tensor (#batch, time, channels).
@@ -127,11 +127,11 @@ class ConvolutionModule(nn.Layer):
x = x.transpose([0, 2, 1]) # [B, C, T]
# mask batch padding
- if paddle.shape(mask_pad)[2] > 0: # time > 0
+ if paddle.shape(mask_pad)[2] > 0: # time > 0
x = x.masked_fill(mask_pad, 0.0)
if self.lorder > 0:
- if paddle.shape(cache)[2] == 0: # cache_t == 0
+ if paddle.shape(cache)[2] == 0: # cache_t == 0
x = nn.functional.pad(
x, [self.lorder, 0], 'constant', 0.0, data_format='NCL')
else:
@@ -161,7 +161,7 @@ class ConvolutionModule(nn.Layer):
x = self.pointwise_conv2(x)
# mask batch padding
- if paddle.shape(mask_pad)[2] > 0: # time > 0
+ if paddle.shape(mask_pad)[2] > 0: # time > 0
x = x.masked_fill(mask_pad, 0.0)
x = x.transpose([0, 2, 1]) # [B, T, C]
diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
index 0f50db21d..652660e16 100644
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@@ -53,7 +53,7 @@ class CTCDecoderBase(nn.Layer):
enc_n_units,
blank_id=0,
dropout_rate: float=0.0,
- reduction: bool=True,
+ reduction_type: str="sum",
batch_average: bool=True,
grad_norm_type: Union[str, None]=None):
"""CTC decoder
@@ -73,7 +73,7 @@ class CTCDecoderBase(nn.Layer):
self.odim = odim
self.dropout = nn.Dropout(dropout_rate)
self.ctc_lo = Linear(enc_n_units, self.odim)
- reduction_type = "sum" if reduction else "none"
+ reduction_type = reduction_type if reduction_type else "none"
self.criterion = CTCLoss(
blank=self.blank_id,
reduction=reduction_type,
diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py
index c8843b723..37b124e84 100644
--- a/paddlespeech/s2t/modules/decoder_layer.py
+++ b/paddlespeech/s2t/modules/decoder_layer.py
@@ -121,16 +121,11 @@ class DecoderLayer(nn.Layer):
if self.concat_after:
tgt_concat = paddle.cat(
- (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
- paddle.empty([0]),
- paddle.zeros([0, 0, 0, 0]))[0]),
- dim=-1)
+ (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0]), dim=-1)
x = residual + self.concat_linear1(tgt_concat)
else:
x = residual + self.dropout(
- self.self_attn(tgt_q, tgt, tgt, tgt_q_mask,
- paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[
- 0])
+ self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)[0])
if not self.normalize_before:
x = self.norm1(x)
@@ -139,15 +134,11 @@ class DecoderLayer(nn.Layer):
x = self.norm2(x)
if self.concat_after:
x_concat = paddle.cat(
- (x, self.src_attn(x, memory, memory, memory_mask,
- paddle.empty([0]),
- paddle.zeros([0, 0, 0, 0]))[0]),
- dim=-1)
+ (x, self.src_attn(x, memory, memory, memory_mask)[0]), dim=-1)
x = residual + self.concat_linear2(x_concat)
else:
x = residual + self.dropout(
- self.src_attn(x, memory, memory, memory_mask,
- paddle.empty([0]), paddle.zeros([0, 0, 0, 0]))[0])
+ self.src_attn(x, memory, memory, memory_mask)[0])
if not self.normalize_before:
x = self.norm2(x)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index cf4e32fa4..bff2d69bb 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -14,6 +14,8 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Encoder definition."""
+from typing import List
+from typing import Optional
from typing import Tuple
import paddle
@@ -175,9 +177,7 @@ class BaseEncoder(nn.Layer):
decoding_chunk_size, self.static_chunk_size,
num_decoding_left_chunks)
for layer in self.encoders:
- xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad,
- paddle.zeros([0, 0, 0, 0]),
- paddle.zeros([0, 0, 0, 0]))
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
if self.normalize_before:
xs = self.after_norm(xs)
# Here we assume the mask is not changed in encoder layers, so just
@@ -190,9 +190,9 @@ class BaseEncoder(nn.Layer):
xs: paddle.Tensor,
offset: int,
required_cache_size: int,
- att_cache: paddle.Tensor, # paddle.zeros([0,0,0,0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0,0,0,0]),
- att_mask: paddle.Tensor, # paddle.ones([0,0,0], dtype=paddle.bool)
+ att_cache: paddle.Tensor = paddle.zeros([0,0,0,0]),
+ cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0]),
+ att_mask: paddle.Tensor = paddle.ones([0,0,0], dtype=paddle.bool),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
""" Forward just one chunk
Args:
@@ -227,7 +227,7 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
- xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
+ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim)
elayers = paddle.shape(att_cache)[0]
@@ -252,17 +252,14 @@ class BaseEncoder(nn.Layer):
# att_cache[i:i+1] = (1, head, cache_t1, d_k*2)
# cnn_cache[i:i+1] = (1, B=1, hidden-dim, cache_t2)
xs, _, new_att_cache, new_cnn_cache = layer(
- xs,
- att_mask,
- pos_emb,
- mask_pad=paddle.ones([0, 0, 0], dtype=paddle.bool),
- att_cache=att_cache[i:i + 1] if elayers > 0 else att_cache,
- cnn_cache=cnn_cache[i:i + 1]
- if paddle.shape(cnn_cache)[0] > 0 else cnn_cache, )
+ xs, att_mask, pos_emb,
+ att_cache=att_cache[i:i+1] if elayers > 0 else att_cache,
+ cnn_cache=cnn_cache[i:i+1] if paddle.shape(cnn_cache)[0] > 0 else cnn_cache,
+ )
# new_att_cache = (1, head, attention_key_size, d_k*2)
# new_cnn_cache = (B=1, hidden-dim, cache_t2)
- r_att_cache.append(new_att_cache[:, :, next_cache_start:, :])
- r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim
+ r_att_cache.append(new_att_cache[:,:, next_cache_start:, :])
+ r_cnn_cache.append(new_cnn_cache.unsqueeze(0)) # add elayer dim
if self.normalize_before:
xs = self.after_norm(xs)
@@ -273,6 +270,7 @@ class BaseEncoder(nn.Layer):
r_cnn_cache = paddle.concat(r_cnn_cache, axis=0)
return xs, r_att_cache, r_cnn_cache
+
def forward_chunk_by_chunk(
self,
xs: paddle.Tensor,
@@ -317,8 +315,8 @@ class BaseEncoder(nn.Layer):
num_frames = xs.shape[1]
required_cache_size = decoding_chunk_size * num_decoding_left_chunks
- att_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor = paddle.zeros([0, 0, 0, 0])
+ att_cache: paddle.Tensor = paddle.zeros([0,0,0,0])
+ cnn_cache: paddle.Tensor = paddle.zeros([0,0,0,0])
outputs = []
offset = 0
@@ -328,8 +326,7 @@ class BaseEncoder(nn.Layer):
chunk_xs = xs[:, cur:end, :]
(y, att_cache, cnn_cache) = self.forward_chunk(
- chunk_xs, offset, required_cache_size, att_cache, cnn_cache,
- paddle.ones([0, 0, 0], dtype=paddle.bool))
+ chunk_xs, offset, required_cache_size, att_cache, cnn_cache)
outputs.append(y)
offset += y.shape[1]
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index 4555b535f..5f810dfde 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -76,10 +76,9 @@ class TransformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
- mask_pad: paddle.
- Tensor, # paddle.ones([0, 0, 0], dtype=paddle.bool)
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:
@@ -106,8 +105,7 @@ class TransformerEncoderLayer(nn.Layer):
if self.normalize_before:
x = self.norm1(x)
- x_att, new_att_cache = self.self_attn(
- x, x, x, mask, paddle.empty([0]), cache=att_cache)
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, cache=att_cache)
if self.concat_after:
x_concat = paddle.concat((x, x_att), axis=-1)
@@ -195,9 +193,9 @@ class ConformerEncoderLayer(nn.Layer):
x: paddle.Tensor,
mask: paddle.Tensor,
pos_emb: paddle.Tensor,
- mask_pad: paddle.Tensor, #paddle.ones([0, 0, 0],dtype=paddle.bool)
- att_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
- cnn_cache: paddle.Tensor, # paddle.zeros([0, 0, 0, 0])
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
"""Compute encoded features.
Args:
diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py
index 6eae5713e..cdcf2e052 100644
--- a/paddlespeech/s2t/modules/initializer.py
+++ b/paddlespeech/s2t/modules/initializer.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
+import numpy as np
class DefaultInitializerContext(object):
"""
diff --git a/paddlespeech/s2t/training/optimizer.py b/paddlespeech/s2t/training/optimizer.py
index f7f70c570..75d3f5f5c 100644
--- a/paddlespeech/s2t/training/optimizer.py
+++ b/paddlespeech/s2t/training/optimizer.py
@@ -103,6 +103,8 @@ class OptimizerFactory():
grad_clip = ClipGradByGlobalNormWithLog(
args['grad_clip']) if "grad_clip" in args else None
+ # grad_clip = paddle.nn.ClipGradByGlobalNorm(
+ # args['grad_clip']) if "grad_clip" in args else None
weight_decay = L2Decay(
args['weight_decay']) if "weight_decay" in args else None
if weight_decay:
diff --git a/paddlespeech/s2t/training/scheduler.py b/paddlespeech/s2t/training/scheduler.py
index b22f7ef85..3464e2299 100644
--- a/paddlespeech/s2t/training/scheduler.py
+++ b/paddlespeech/s2t/training/scheduler.py
@@ -106,6 +106,59 @@ class ConstantLR(LRScheduler):
def get_lr(self):
return self.base_lr
+@register_scheduler
+class NewBobScheduler(LRScheduler):
+    """Scheduler with the new-bob technique, typically used for LR annealing.
+
+    The learning rate is annealed based on a validation metric: if the relative
+    improvement over the previous epoch is below ``improvement_threshold`` and
+    patience is exhausted, the learning rate is multiplied by ``annealing_factor``.
+
+    Args:
+        learning_rate (float): The initial learning rate. It is a python float number.
+        annealing_factor (float, optional): Factor applied to the learning rate when
+            annealing is triggered. Default: 0.5.
+        improvement_threshold (float, optional): Minimum relative improvement of the
+            metric required to keep the current learning rate. Default: 0.0025.
+        patient (int, optional): Number of epochs with insufficient improvement to
+            tolerate before annealing. Default: 0.
+    """
+ def __init__(
+ self,
+ learning_rate,
+ annealing_factor=0.5,
+ improvement_threshold=0.0025,
+ patient=0,
+ ):
+ self.hyperparam_value = learning_rate
+ self.annealing_factor = annealing_factor
+ self.improvement_threshold = improvement_threshold
+ self.patient = patient
+ self.metric_values = []
+ self.current_patient = self.patient
+
+ def __call__(self, metric_value):
+ """Returns the current and new value for the hyperparameter.
+
+ Arguments
+ ---------
+        metric_value : float
+ A number for determining whether to change the hyperparameter value.
+ """
+ old_value = new_value = self.hyperparam_value
+ if len(self.metric_values) > 0:
+ prev_metric = self.metric_values[-1]
+ # Update value if improvement too small and patience is 0
+ if prev_metric == 0: # Prevent division by zero
+ improvement = 0
+ else:
+ improvement = (prev_metric - metric_value) / prev_metric
+ if improvement < self.improvement_threshold:
+ if self.current_patient == 0:
+ new_value *= self.annealing_factor
+ self.current_patient = self.patient
+ else:
+ self.current_patient -= 1
+ # Store relevant info
+ self.metric_values.append(metric_value)
+ self.hyperparam_value = new_value
+
+ return old_value, new_value
+
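+# Illustrative usage sketch (assumed training-loop integration, not taken from
+# this repository): the scheduler is called once per epoch with a validation
+# metric and returns the previous and the (possibly annealed) learning rate.
+#
+#   scheduler = NewBobScheduler(learning_rate=1.0, annealing_factor=0.8)
+#   for epoch in range(max_epoch):
+#       valid_loss = validate(model)          # hypothetical helper
+#       _, new_lr = scheduler(valid_loss)
+#       optimizer.set_lr(new_lr)
+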
def dynamic_import_scheduler(module):
"""Import Scheduler class dynamically.
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index a7eb9892d..815b61e0f 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -19,6 +19,8 @@ from pathlib import Path
import paddle
from paddle import distributed as dist
+dist.init_parallel_env()
+
from visualdl import LogWriter
from paddlespeech.s2t.training.reporter import ObsScope
@@ -130,7 +132,9 @@ class Trainer():
latest_n=self.config.checkpoint.latest_n)
# set random seed if needed
+ print(args.seed)
if args.seed:
+ print('***********')
seed_all(args.seed)
logger.info(f"Set seed {args.seed}")
@@ -176,7 +180,7 @@ class Trainer():
def init_parallel(self):
"""Init environment for multiprocess training.
"""
- dist.init_parallel_env()
+ # dist.init_parallel_env()
@mp_tools.rank_zero_only
def save(self, tag=None, infos: dict=None):
diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 55f241ec7..8650154e9 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -25,7 +25,6 @@ asr_python:
cfg_path: # [optional]
ckpt_path: # [optional]
decode_method: 'attention_rescoring'
- num_decoding_left_chunks: -1
force_yes: True
device: # set 'gpu:id' or 'cpu'
@@ -39,7 +38,6 @@ asr_inference:
lang: 'zh'
sample_rate: 16000
cfg_path:
- num_decoding_left_chunks: -1
decode_method:
force_yes: True
diff --git a/paddlespeech/server/engine/asr/online/ctc_endpoint.py b/paddlespeech/server/engine/asr/online/ctc_endpoint.py
index 1b8ad1cb7..b87dbe805 100644
--- a/paddlespeech/server/engine/asr/online/ctc_endpoint.py
+++ b/paddlespeech/server/engine/asr/online/ctc_endpoint.py
@@ -102,10 +102,8 @@ class OnlineCTCEndpoint:
assert self.num_frames_decoded >= self.trailing_silence_frames
assert self.frame_shift_in_ms > 0
-
- decoding_something = (
- self.num_frames_decoded > self.trailing_silence_frames
- ) and decoding_something
+
+ decoding_something = (self.num_frames_decoded > self.trailing_silence_frames) and decoding_something
utterance_length = self.num_frames_decoded * self.frame_shift_in_ms
trailing_silence = self.trailing_silence_frames * self.frame_shift_in_ms
diff --git a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
index 6daae5be3..ab4f11305 100644
--- a/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/onnx/asr_engine.py
@@ -21,12 +21,12 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
-from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils import onnx_infer
diff --git a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
index 0fd5d1bc6..182e64180 100644
--- a/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/paddleinference/asr_engine.py
@@ -21,10 +21,10 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
-from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.utility import UpdateConfig
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 87d88ee60..4df38f09d 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -21,10 +21,10 @@ import paddle
from numpy import float32
from yacs.config import CfgNode
-from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.log import logger
from paddlespeech.resource import CommonTaskResource
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.modules.ctc import CTCDecoder
from paddlespeech.s2t.utils.tensor_utils import add_sos_eos
@@ -130,8 +130,8 @@ class PaddleASRConnectionHanddler:
## conformer
# cache for conformer online
- self.att_cache = paddle.zeros([0, 0, 0, 0])
- self.cnn_cache = paddle.zeros([0, 0, 0, 0])
+ self.att_cache = paddle.zeros([0,0,0,0])
+ self.cnn_cache = paddle.zeros([0,0,0,0])
self.encoder_out = None
# conformer decoding state
@@ -474,14 +474,9 @@ class PaddleASRConnectionHanddler:
# cur chunk
chunk_xs = self.cached_feat[:, cur:end, :]
# forward chunk
- (y, self.att_cache,
- self.cnn_cache) = self.model.encoder.forward_chunk(
- chunk_xs,
- self.offset,
- required_cache_size,
- att_cache=self.att_cache,
- cnn_cache=self.cnn_cache,
- att_mask=paddle.ones([0, 0, 0], dtype=paddle.bool))
+ (y, self.att_cache, self.cnn_cache) = self.model.encoder.forward_chunk(
+ chunk_xs, self.offset, required_cache_size,
+ self.att_cache, self.cnn_cache)
outputs.append(y)
# update the global offset, in decoding frame unit
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e297e5c21..02c40fd12 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -68,12 +68,9 @@ class ASREngine(BaseEngine):
return False
self.executor._init_from_path(
- model_type=self.config.model,
- lang=self.config.lang,
- sample_rate=self.config.sample_rate,
- cfg_path=self.config.cfg_path,
- decode_method=self.config.decode_method,
- ckpt_path=self.config.ckpt_path)
+ self.config.model, self.config.lang, self.config.sample_rate,
+ self.config.cfg_path, self.config.decode_method,
+ self.config.ckpt_path)
logger.info("Initialize ASR server engine successfully on device: %s." %
(self.device))
diff --git a/paddlespeech/server/engine/vector/python/vector_engine.py b/paddlespeech/server/engine/vector/python/vector_engine.py
index 7b8f667db..f7d60648d 100644
--- a/paddlespeech/server/engine/vector/python/vector_engine.py
+++ b/paddlespeech/server/engine/vector/python/vector_engine.py
@@ -105,8 +105,7 @@ class PaddleVectorConnectionHandler:
# we can not reuse the cache io.BytesIO(audio) data,
# because the soundfile will change the io.BytesIO(audio) to the end
# thus we should convert the base64 string to io.BytesIO when we need the audio data
- if not self.executor._check(
- io.BytesIO(audio), sample_rate, force_yes=True):
+ if not self.executor._check(io.BytesIO(audio), sample_rate):
logger.debug("check the audio sample rate occurs error")
return np.array([0.0])
diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py
index c00648b1f..2cb7a11a2 100644
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
@@ -11,12 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from typing import Collection
+from typing import Dict
+from typing import List
+from typing import Tuple
+
import numpy as np
import paddle
from paddlespeech.t2s.datasets.batch import batch_sequences
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.modules.nets_utils import get_seg_pos
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.nets_utils import pad_list
from paddlespeech.t2s.modules.nets_utils import phones_masking
from paddlespeech.t2s.modules.nets_utils import phones_text_masking
@@ -485,56 +492,180 @@ def vits_single_spk_batch_fn(examples):
return batch
-def vits_multi_spk_batch_fn(examples):
- """
- Returns:
- Dict[str, Any]:
- - text (Tensor): Text index tensor (B, T_text).
- - text_lengths (Tensor): Text length tensor (B,).
- - feats (Tensor): Feature tensor (B, T_feats, aux_channels).
- - feats_lengths (Tensor): Feature length tensor (B,).
- - speech (Tensor): Speech waveform tensor (B, T_wav).
- - spk_id (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
- - spk_emb (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
- """
- # fields = ["text", "text_lengths", "feats", "feats_lengths", "speech", "spk_id"/"spk_emb"]
- text = [np.array(item["text"], dtype=np.int64) for item in examples]
- feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
- speech = [np.array(item["wave"], dtype=np.float32) for item in examples]
- text_lengths = [
- np.array(item["text_lengths"], dtype=np.int64) for item in examples
- ]
- feats_lengths = [
- np.array(item["feats_lengths"], dtype=np.int64) for item in examples
- ]
+# for ERNIE SAT
+class MLMCollateFn:
+ """Functor class of common_collate_fn()"""
- text = batch_sequences(text)
- feats = batch_sequences(feats)
- speech = batch_sequences(speech)
+ def __init__(
+ self,
+ feats_extract,
+ mlm_prob: float=0.8,
+ mean_phn_span: int=8,
+ seg_emb: bool=False,
+ text_masking: bool=False,
+ attention_window: int=0,
+ not_sequence: Collection[str]=(), ):
+ self.mlm_prob = mlm_prob
+ self.mean_phn_span = mean_phn_span
+ self.feats_extract = feats_extract
+ self.not_sequence = set(not_sequence)
+ self.attention_window = attention_window
+ self.seg_emb = seg_emb
+ self.text_masking = text_masking
- # convert each batch to paddle.Tensor
- text = paddle.to_tensor(text)
+ def __call__(self, data: Collection[Tuple[str, Dict[str, np.ndarray]]]
+ ) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
+ return mlm_collate_fn(
+ data,
+ feats_extract=self.feats_extract,
+ mlm_prob=self.mlm_prob,
+ mean_phn_span=self.mean_phn_span,
+ seg_emb=self.seg_emb,
+ text_masking=self.text_masking,
+ not_sequence=self.not_sequence)
+
+
+def mlm_collate_fn(
+ data: Collection[Tuple[str, Dict[str, np.ndarray]]],
+ feats_extract=None,
+ mlm_prob: float=0.8,
+ mean_phn_span: int=8,
+ seg_emb: bool=False,
+ text_masking: bool=False,
+ pad_value: int=0,
+ not_sequence: Collection[str]=(),
+) -> Tuple[List[str], Dict[str, paddle.Tensor]]:
+ uttids = [u for u, _ in data]
+ data = [d for _, d in data]
+
+ assert all(set(data[0]) == set(d) for d in data), "dict-keys mismatching"
+ assert all(not k.endswith("_lens")
+ for k in data[0]), f"*_lens is reserved: {list(data[0])}"
+
+ output = {}
+ for key in data[0]:
+
+ array_list = [d[key] for d in data]
+
+ # Assume the first axis is length:
+ # tensor_list: Batch x (Length, ...)
+ tensor_list = [paddle.to_tensor(a) for a in array_list]
+ # tensor: (Batch, Length, ...)
+ tensor = pad_list(tensor_list, pad_value)
+ output[key] = tensor
+
+ # lens: (Batch,)
+ if key not in not_sequence:
+ lens = paddle.to_tensor(
+ [d[key].shape[0] for d in data], dtype=paddle.int64)
+ output[key + "_lens"] = lens
+
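+    # NOTE: only the first utterance's waveform is converted to a log-mel
+    # spectrogram below, so this collate function effectively assumes a
+    # batch size of 1.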
+ feats = feats_extract.get_log_mel_fbank(np.array(output["speech"][0]))
feats = paddle.to_tensor(feats)
- text_lengths = paddle.to_tensor(text_lengths)
- feats_lengths = paddle.to_tensor(feats_lengths)
+ print("feats.shape:", feats.shape)
+ feats_lens = paddle.shape(feats)[0]
+ feats = paddle.unsqueeze(feats, 0)
- batch = {
- "text": text,
- "text_lengths": text_lengths,
- "feats": feats,
- "feats_lengths": feats_lengths,
- "speech": speech
- }
- # spk_emb has a higher priority than spk_id
- if "spk_emb" in examples[0]:
- spk_emb = [
- np.array(item["spk_emb"], dtype=np.float32) for item in examples
- ]
- spk_emb = batch_sequences(spk_emb)
- spk_emb = paddle.to_tensor(spk_emb)
- batch["spk_emb"] = spk_emb
- elif "spk_id" in examples[0]:
- spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
- spk_id = paddle.to_tensor(spk_id)
- batch["spk_id"] = spk_id
- return batch
+ text = output["text"]
+ text_lens = output["text_lens"]
+ align_start = output["align_start"]
+ align_start_lens = output["align_start_lens"]
+ align_end = output["align_end"]
+
+ max_tlen = max(text_lens)
+ max_slen = max(feats_lens)
+
+ speech_pad = feats[:, :max_slen]
+
+ text_pad = text
+ text_mask = make_non_pad_mask(
+ text_lens, text_pad, length_dim=1).unsqueeze(-2)
+ speech_mask = make_non_pad_mask(
+ feats_lens, speech_pad[:, :, 0], length_dim=1).unsqueeze(-2)
+
+ span_bdy = None
+ if 'span_bdy' in output.keys():
+ span_bdy = output['span_bdy']
+
+    # dual mask: for mixed Chinese-English input, mask both speech and text
+    # ERNIE-SAT masks both modalities in the cross-lingual setting
+ if text_masking:
+ masked_pos, text_masked_pos = phones_text_masking(
+ xs_pad=speech_pad,
+ src_mask=speech_mask,
+ text_pad=text_pad,
+ text_mask=text_mask,
+ align_start=align_start,
+ align_end=align_end,
+ align_start_lens=align_start_lens,
+ mlm_prob=mlm_prob,
+ mean_phn_span=mean_phn_span,
+ span_bdy=span_bdy)
+    # for pure-Chinese and pure-English training -> A3T does not mask phonemes,
+    # it only masks speech
+    # the main difference between A3T and ERNIE-SAT lies in how masking is done
+ else:
+ masked_pos = phones_masking(
+ xs_pad=speech_pad,
+ src_mask=speech_mask,
+ align_start=align_start,
+ align_end=align_end,
+ align_start_lens=align_start_lens,
+ mlm_prob=mlm_prob,
+ mean_phn_span=mean_phn_span,
+ span_bdy=span_bdy)
+ text_masked_pos = paddle.zeros(paddle.shape(text_pad))
+
+ output_dict = {}
+
+ speech_seg_pos, text_seg_pos = get_seg_pos(
+ speech_pad=speech_pad,
+ text_pad=text_pad,
+ align_start=align_start,
+ align_end=align_end,
+ align_start_lens=align_start_lens,
+ seg_emb=seg_emb)
+ output_dict['speech'] = speech_pad
+ output_dict['text'] = text_pad
+ output_dict['masked_pos'] = masked_pos
+ output_dict['text_masked_pos'] = text_masked_pos
+ output_dict['speech_mask'] = speech_mask
+ output_dict['text_mask'] = text_mask
+ output_dict['speech_seg_pos'] = speech_seg_pos
+ output_dict['text_seg_pos'] = text_seg_pos
+ output = (uttids, output_dict)
+ return output
+
+
+def build_mlm_collate_fn(
+ sr: int=24000,
+ n_fft: int=2048,
+ hop_length: int=300,
+ win_length: int=None,
+ n_mels: int=80,
+ fmin: int=80,
+ fmax: int=7600,
+ mlm_prob: float=0.8,
+ mean_phn_span: int=8,
+ seg_emb: bool=False,
+ epoch: int=-1, ):
+ feats_extract_class = LogMelFBank
+
+ feats_extract = feats_extract_class(
+ sr=sr,
+ n_fft=n_fft,
+ hop_length=hop_length,
+ win_length=win_length,
+ n_mels=n_mels,
+ fmin=fmin,
+ fmax=fmax)
+
+ if epoch == -1:
+ mlm_prob_factor = 1
+ else:
+ mlm_prob_factor = 0.8
+
+ return MLMCollateFn(
+ feats_extract=feats_extract,
+ mlm_prob=mlm_prob * mlm_prob_factor,
+ mean_phn_span=mean_phn_span,
+ seg_emb=seg_emb)
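+
+
+# Illustrative usage sketch (hypothetical values): build the collate function once
+# and apply it to a list of (utt_id, datum) pairs, where each datum holds the
+# "speech", "text", "align_start", "align_end" (and related) arrays.
+#
+#   collate_fn = build_mlm_collate_fn(sr=24000, n_fft=2048, hop_length=300)
+#   uttids, batch = collate_fn([(utt_id, datum)])
+#   # batch contains padded "speech"/"text" plus masks and segment positions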
diff --git a/paddlespeech/t2s/datasets/sampler.py b/paddlespeech/t2s/datasets/sampler.py
index cbc9764c5..a69bc8600 100644
--- a/paddlespeech/t2s/datasets/sampler.py
+++ b/paddlespeech/t2s/datasets/sampler.py
@@ -1,9 +1,8 @@
+import paddle
import math
-
import numpy as np
from paddle.io import BatchSampler
-
class ErnieSATSampler(BatchSampler):
"""Sampler that restricts data loading to a subset of the dataset.
In such case, each process can pass a DistributedBatchSampler instance
@@ -71,7 +70,7 @@ class ErnieSATSampler(BatchSampler):
assert isinstance(drop_last, bool), \
"drop_last should be a boolean number"
- from paddle.distributed import ParallelEnv
+ from paddle.fluid.dygraph.parallel import ParallelEnv
if num_replicas is not None:
assert isinstance(num_replicas, int) and num_replicas > 0, \
@@ -111,8 +110,8 @@ class ErnieSATSampler(BatchSampler):
subsampled_indices.extend(indices[i:i + self.batch_size])
indices = indices[len(indices) - last_batch_size:]
- subsampled_indices.extend(
- indices[self.local_rank * last_local_batch_size:(
+ subsampled_indices.extend(indices[
+ self.local_rank * last_local_batch_size:(
self.local_rank + 1) * last_local_batch_size])
return subsampled_indices
diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py
index 464f51a3b..529a8221c 100755
--- a/paddlespeech/t2s/exps/ernie_sat/align.py
+++ b/paddlespeech/t2s/exps/ernie_sat/align.py
@@ -19,9 +19,9 @@ import librosa
import numpy as np
import pypinyin
from praatio import textgrid
-
-from paddlespeech.t2s.exps.ernie_sat.utils import get_dict
from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
+from paddlespeech.t2s.exps.ernie_sat.utils import get_dict
+
DICT_EN = 'tools/aligner/cmudict-0.7b'
DICT_ZH = 'tools/aligner/simple.lexicon'
@@ -30,7 +30,6 @@ MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
MFA_PATH = 'tools/montreal-forced-aligner/bin'
os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
-
def _get_max_idx(dic):
return sorted([int(key.split('_')[0]) for key in dic.keys()])[-1]
@@ -107,11 +106,11 @@ def alignment(wav_path: str,
wav_name = os.path.basename(wav_path)
utt = wav_name.split('.')[0]
# prepare data for MFA
- tmp_name = get_tmp_name(text=text)
+ tmp_name = get_tmp_name(text=text)
tmpbase = './tmp_dir/' + tmp_name
tmpbase = Path(tmpbase)
tmpbase.mkdir(parents=True, exist_ok=True)
- print("tmp_name in alignment:", tmp_name)
+ print("tmp_name in alignment:",tmp_name)
shutil.copyfile(wav_path, tmpbase / wav_name)
txt_name = utt + '.txt'
@@ -341,7 +340,7 @@ def get_phns_spans(wav_path: str,
if __name__ == '__main__':
text = "For that reason cover should not be given."
- phn, dur, word2phns = alignment("source/p243_313.wav", text, lang='en')
+ phn, dur, word2phns = alignment("exp/p243_313.wav", text, lang='en')
print(phn, dur)
print(word2phns)
print("---------------------------------")
@@ -353,7 +352,7 @@ if __name__ == '__main__':
style=pypinyin.Style.TONE3,
tone_sandhi=True)
text_zh = " ".join(text_zh)
- phn, dur, word2phns = alignment("source/000001.wav", text_zh, lang='zh')
+ phn, dur, word2phns = alignment("exp/000001.wav", text_zh, lang='zh')
print(phn, dur)
print(word2phns)
print("---------------------------------")
@@ -368,7 +367,7 @@ if __name__ == '__main__':
print("---------------------------------")
outs = get_phns_spans(
- wav_path="source/p243_313.wav",
+ wav_path="exp/p243_313.wav",
old_str="For that reason cover should not be given.",
new_str="for that reason cover is impossible to be given.")
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index 21c9ae044..95b07367c 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -11,41 +11,35 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
-import os
-from pathlib import Path
-from typing import List
-
import librosa
import numpy as np
-import paddle
-import pypinyin
import soundfile as sf
-import yaml
-from pypinyin_dict.phrase_pinyin_data import large_pinyin
-from yacs.config import CfgNode
-from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn
-from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.exps.ernie_sat.align import get_phns_spans
from paddlespeech.t2s.exps.ernie_sat.utils import eval_durs
from paddlespeech.t2s.exps.ernie_sat.utils import get_dur_adj_factor
from paddlespeech.t2s.exps.ernie_sat.utils import get_span_bdy
-from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
-from paddlespeech.t2s.exps.syn_utils import get_am_inference
-from paddlespeech.t2s.exps.syn_utils import get_voc_inference
+from paddlespeech.t2s.datasets.am_batch_fn import build_erniesat_collate_fn
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.exps.syn_utils import norm
-from paddlespeech.t2s.utils import str2bool
-large_pinyin.load()
+from paddlespeech.t2s.exps.ernie_sat.utils import get_tmp_name
-def _p2id(phonemes: List[str]) -> np.ndarray:
+
+
+
+
+def _p2id(phonemes: List[str]) -> np.ndarray:
# replace unk phone with sp
- phonemes = [phn if phn in vocab_phones else "sp" for phn in phonemes]
+ phonemes = [
+ phn if phn in vocab_phones else "sp" for phn in phonemes
+ ]
phone_ids = [vocab_phones[item] for item in phonemes]
return np.array(phone_ids, np.int64)
+
def prep_feats_with_dur(wav_path: str,
old_str: str='',
new_str: str='',
@@ -73,12 +67,12 @@ def prep_feats_with_dur(wav_path: str,
fs=fs,
n_shift=n_shift)
- mfa_start = phns_spans_outs['mfa_start']
- mfa_end = phns_spans_outs['mfa_end']
- old_phns = phns_spans_outs['old_phns']
- new_phns = phns_spans_outs['new_phns']
- span_to_repl = phns_spans_outs['span_to_repl']
- span_to_add = phns_spans_outs['span_to_add']
+ mfa_start = phns_spans_outs["mfa_start"]
+ mfa_end = phns_spans_outs["mfa_end"]
+ old_phns = phns_spans_outs["old_phns"]
+ new_phns = phns_spans_outs["new_phns"]
+ span_to_repl = phns_spans_outs["span_to_repl"]
+ span_to_add = phns_spans_outs["span_to_add"]
# 中文的 phns 不一定都在 fastspeech2 的字典里, 用 sp 代替
if target_lang in {'en', 'zh'}:
@@ -138,7 +132,7 @@ def prep_feats_with_dur(wav_path: str,
[wav_org[:wav_left_idx], blank_wav, wav_org[wav_right_idx:]])
# 音频是正常遮住了
- sf.write(str("mask_wav.wav"), new_wav, samplerate=fs)
+ sf.write(str("new_wav.wav"), new_wav, samplerate=fs)
# 4. get old and new mel span to be mask
old_span_bdy = get_span_bdy(
@@ -158,6 +152,8 @@ def prep_feats_with_dur(wav_path: str,
return outs
+
+
def prep_feats(wav_path: str,
old_str: str='',
new_str: str='',
@@ -167,7 +163,7 @@ def prep_feats(wav_path: str,
fs: int=24000,
n_shift: int=300):
- with_dur_outs = prep_feats_with_dur(
+ outs = prep_feats_with_dur(
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
@@ -180,240 +176,138 @@ def prep_feats(wav_path: str,
wav_name = os.path.basename(wav_path)
utt_id = wav_name.split('.')[0]
- wav = with_dur_outs['new_wav']
- phns = with_dur_outs['new_phns']
- mfa_start = with_dur_outs['new_mfa_start']
- mfa_end = with_dur_outs['new_mfa_end']
- old_span_bdy = with_dur_outs['old_span_bdy']
- new_span_bdy = with_dur_outs['new_span_bdy']
+ wav = outs['new_wav']
+ phns = outs['new_phns']
+ mfa_start = outs['new_mfa_start']
+ mfa_end = outs['new_mfa_end']
+ old_span_bdy = outs['old_span_bdy']
+ new_span_bdy = outs['new_span_bdy']
span_bdy = np.array(new_span_bdy)
+ text = _p2id(phns)
mel = mel_extractor.get_log_mel_fbank(wav)
erniesat_mean, erniesat_std = np.load(erniesat_stat)
normed_mel = norm(mel, erniesat_mean, erniesat_std)
- tmp_name = get_tmp_name(text=old_str)
+ tmp_name = get_tmp_name(text=old_str)
tmpbase = './tmp_dir/' + tmp_name
tmpbase = Path(tmpbase)
tmpbase.mkdir(parents=True, exist_ok=True)
+ print("tmp_name in synthesize_e2e:",tmp_name)
mel_path = tmpbase / 'mel.npy'
- np.save(mel_path, normed_mel)
+ print("mel_path:",mel_path)
+    np.save(mel_path, normed_mel)
durations = [e - s for e, s in zip(mfa_end, mfa_start)]
- text = _p2id(phns)
- datum = {
- "utt_id": utt_id,
- "spk_id": 0,
- "text": text,
- "text_lengths": len(text),
- "speech_lengths": len(normed_mel),
- "durations": durations,
- "speech": np.load(mel_path),
- "align_start": mfa_start,
+    datum = {
+        "utt_id": utt_id,
+        "spk_id": 0,
+        "text": text,
+        "text_lengths": len(text),
+        "speech_lengths": len(normed_mel),
+        "durations": durations,
+        "speech": np.load(mel_path),
+        "align_start": mfa_start,
"align_end": mfa_end,
"span_bdy": span_bdy
}
batch = collate_fn([datum])
- outs = dict()
- outs['batch'] = batch
- outs['old_span_bdy'] = old_span_bdy
- outs['new_span_bdy'] = new_span_bdy
- return outs
-
-
-def get_mlm_output(wav_path: str,
- old_str: str='',
- new_str: str='',
- source_lang: str='en',
- target_lang: str='en',
- duration_adjust: bool=True,
- fs: int=24000,
- n_shift: int=300):
-
- prep_feats_outs = prep_feats(
+ print("batch:",batch)
+
+ return batch, old_span_bdy, new_span_bdy
+
+
+def decode_with_model(mlm_model: nn.Layer,
+ collate_fn,
+ wav_path: str,
+ old_str: str='',
+ new_str: str='',
+ source_lang: str='en',
+ target_lang: str='en',
+ use_teacher_forcing: bool=False,
+ duration_adjust: bool=True,
+ fs: int=24000,
+ n_shift: int=300,
+ token_list: List[str]=[]):
+ batch, old_span_bdy, new_span_bdy = prep_feats(
+ source_lang=source_lang,
+ target_lang=target_lang,
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
- source_lang=source_lang,
- target_lang=target_lang,
duration_adjust=duration_adjust,
fs=fs,
- n_shift=n_shift)
+        n_shift=n_shift)
+
- batch = prep_feats_outs['batch']
- new_span_bdy = prep_feats_outs['new_span_bdy']
- old_span_bdy = prep_feats_outs['old_span_bdy']
- out_mels = erniesat_inference(
- speech=batch['speech'],
- text=batch['text'],
- masked_pos=batch['masked_pos'],
- speech_mask=batch['speech_mask'],
- text_mask=batch['text_mask'],
- speech_seg_pos=batch['speech_seg_pos'],
- text_seg_pos=batch['text_seg_pos'],
- span_bdy=new_span_bdy)
+ feats = collate_fn(batch)[1]
+
+ if 'text_masked_pos' in feats.keys():
+ feats.pop('text_masked_pos')
+
+ output = mlm_model.inference(
+ text=feats['text'],
+ speech=feats['speech'],
+ masked_pos=feats['masked_pos'],
+ speech_mask=feats['speech_mask'],
+ text_mask=feats['text_mask'],
+ speech_seg_pos=feats['speech_seg_pos'],
+ text_seg_pos=feats['text_seg_pos'],
+ span_bdy=new_span_bdy,
+ use_teacher_forcing=use_teacher_forcing)
# 拼接音频
- output_feat = paddle.concat(x=out_mels, axis=0)
+ output_feat = paddle.concat(x=output, axis=0)
wav_org, _ = librosa.load(wav_path, sr=fs)
- outs = dict()
- outs['wav_org'] = wav_org
- outs['output_feat'] = output_feat
- outs['old_span_bdy'] = old_span_bdy
- outs['new_span_bdy'] = new_span_bdy
-
- return outs
+    return wav_org, output_feat, old_span_bdy, new_span_bdy, fs, n_shift
-def get_wav(wav_path: str,
- source_lang: str='en',
- target_lang: str='en',
- old_str: str='',
- new_str: str='',
- duration_adjust: bool=True,
- fs: int=24000,
- n_shift: int=300):
+if __name__ == '__main__':
+ fs = 24000
+ n_shift = 300
+ wav_path = "exp/p243_313.wav"
+ old_str = "For that reason cover should not be given."
+ # for edit
+ # new_str = "for that reason cover is impossible to be given."
+ # for synthesize
+ append_str = "do you love me i love you so much"
+ new_str = old_str + append_str
- outs = get_mlm_output(
+ '''
+ outs = prep_feats_with_dur(
wav_path=wav_path,
old_str=old_str,
new_str=new_str,
- source_lang=source_lang,
- target_lang=target_lang,
- duration_adjust=duration_adjust,
fs=fs,
n_shift=n_shift)
- wav_org = outs['wav_org']
- output_feat = outs['output_feat']
+ new_wav = outs['new_wav']
+ new_phns = outs['new_phns']
+ new_mfa_start = outs['new_mfa_start']
+ new_mfa_end = outs['new_mfa_end']
old_span_bdy = outs['old_span_bdy']
new_span_bdy = outs['new_span_bdy']
- masked_feat = output_feat[new_span_bdy[0]:new_span_bdy[1]]
-
- with paddle.no_grad():
- alt_wav = voc_inference(masked_feat)
- alt_wav = np.squeeze(alt_wav)
-
- old_time_bdy = [n_shift * x for x in old_span_bdy]
- wav_replaced = np.concatenate(
- [wav_org[:old_time_bdy[0]], alt_wav, wav_org[old_time_bdy[1]:]])
-
- wav_dict = {"origin": wav_org, "output": wav_replaced}
- return wav_dict
-
-
-def parse_args():
- # parse args and config
- parser = argparse.ArgumentParser(
- description="Synthesize with acoustic model & vocoder")
- # ernie sat
-
- parser.add_argument(
- '--erniesat_config',
- type=str,
- default=None,
- help='Config of acoustic model.')
- parser.add_argument(
- '--erniesat_ckpt',
- type=str,
- default=None,
- help='Checkpoint file of acoustic model.')
- parser.add_argument(
- "--erniesat_stat",
- type=str,
- default=None,
- help="mean and standard deviation used to normalize spectrogram when training acoustic model."
- )
- parser.add_argument(
- "--phones_dict", type=str, default=None, help="phone vocabulary file.")
- # vocoder
- parser.add_argument(
- '--voc',
- type=str,
- default='pwgan_csmsc',
- choices=[
- 'pwgan_aishell3',
- 'pwgan_vctk',
- 'hifigan_aishell3',
- 'hifigan_vctk',
- ],
- help='Choose vocoder type of tts task.')
- parser.add_argument(
- '--voc_config', type=str, default=None, help='Config of voc.')
- parser.add_argument(
- '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
- parser.add_argument(
- "--voc_stat",
- type=str,
- default=None,
- help="mean and standard deviation used to normalize spectrogram when training voc."
- )
- # other
- parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
-
- # ernie sat related
- parser.add_argument("--task_name", type=str, help="task name")
- parser.add_argument("--wav_path", type=str, help="path of old wav")
- parser.add_argument("--old_str", type=str, help="old string")
- parser.add_argument("--new_str", type=str, help="new string")
- parser.add_argument(
- "--source_lang", type=str, default="en", help="source language")
- parser.add_argument(
- "--target_lang", type=str, default="en", help="target language")
- parser.add_argument(
- "--duration_adjust",
- type=str2bool,
- default=True,
- help="whether to adjust duration.")
- parser.add_argument("--output_name", type=str, default="output.wav")
-
- args = parser.parse_args()
- return args
-
+ print("---------------------------------")
-if __name__ == '__main__':
- args = parse_args()
+ print("new_wav:", new_wav)
+ print("new_phns:", new_phns)
+ print("new_mfa_start:", new_mfa_start)
+ print("new_mfa_end:", new_mfa_end)
+ print("old_span_bdy:", old_span_bdy)
+ print("new_span_bdy:", new_span_bdy)
+ print("---------------------------------")
+ '''
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
- paddle.set_device("gpu")
- else:
- print("ngpu should >= 0 !")
+ erniesat_config = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/local/default.yaml"
- # evaluate(args)
- with open(args.erniesat_config) as f:
+ with open(erniesat_config) as f:
erniesat_config = CfgNode(yaml.safe_load(f))
- old_str = args.old_str
- new_str = args.new_str
-
- # convert Chinese characters to pinyin
- if args.source_lang == 'zh':
- old_str = pypinyin.lazy_pinyin(
- old_str,
- neutral_tone_with_five=True,
- style=pypinyin.Style.TONE3,
- tone_sandhi=True)
- old_str = ' '.join(old_str)
- if args.target_lang == 'zh':
- new_str = pypinyin.lazy_pinyin(
- new_str,
- neutral_tone_with_five=True,
- style=pypinyin.Style.TONE3,
- tone_sandhi=True)
- new_str = ' '.join(new_str)
-
- if args.task_name == 'edit':
- new_str = new_str
- elif args.task_name == 'synthesize':
- new_str = old_str + new_str
- else:
- new_str = old_str + new_str
- print("new_str:", new_str)
+
+ erniesat_stat = "/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/train/speech_stats.npy"
# Extractor
mel_extractor = LogMelFBank(
@@ -425,51 +319,28 @@ if __name__ == '__main__':
n_mels=erniesat_config.n_mels,
fmin=erniesat_config.fmin,
fmax=erniesat_config.fmax)
+
+
collate_fn = build_erniesat_collate_fn(
mlm_prob=erniesat_config.mlm_prob,
mean_phn_span=erniesat_config.mean_phn_span,
seg_emb=erniesat_config.model['enc_input_layer'] == 'sega_mlm',
text_masking=False)
-
+
+ phones_dict='/home/yuantian01/PaddleSpeech_ERNIE_SAT/PaddleSpeech/examples/vctk/ernie_sat/dump/phone_id_map.txt'
vocab_phones = {}
- with open(args.phones_dict, 'rt') as f:
+ with open(phones_dict, 'rt') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
vocab_phones[phn] = int(id)
- # ernie sat model
- erniesat_inference = get_am_inference(
- am='erniesat_dataset',
- am_config=erniesat_config,
- am_ckpt=args.erniesat_ckpt,
- am_stat=args.erniesat_stat,
- phones_dict=args.phones_dict)
-
- with open(args.voc_config) as f:
- voc_config = CfgNode(yaml.safe_load(f))
-
- # vocoder
- voc_inference = get_voc_inference(
- voc=args.voc,
- voc_config=voc_config,
- voc_ckpt=args.voc_ckpt,
- voc_stat=args.voc_stat)
-
- erniesat_stat = args.erniesat_stat
-
- wav_dict = get_wav(
- wav_path=args.wav_path,
- source_lang=args.source_lang,
- target_lang=args.target_lang,
- old_str=old_str,
- new_str=new_str,
- duration_adjust=args.duration_adjust,
- fs=erniesat_config.fs,
- n_shift=erniesat_config.n_shift)
-
- sf.write(
- args.output_name, wav_dict['output'], samplerate=erniesat_config.fs)
- print(
- f"\033[1;32;m Generated audio saved into {args.output_name} ! \033[0m")
+ prep_feats(wav_path=wav_path,
+ old_str=old_str,
+ new_str=new_str,
+ fs=fs,
+ n_shift=n_shift)
+
+
+
diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py
index 75a666bb1..af653ef89 100644
--- a/paddlespeech/t2s/exps/ernie_sat/train.py
+++ b/paddlespeech/t2s/exps/ernie_sat/train.py
@@ -25,6 +25,7 @@ from paddle import DataParallel
from paddle import distributed as dist
from paddle import nn
from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode
diff --git a/paddlespeech/t2s/exps/ernie_sat/utils.py b/paddlespeech/t2s/exps/ernie_sat/utils.py
index 6805e513c..9169efa36 100644
--- a/paddlespeech/t2s/exps/ernie_sat/utils.py
+++ b/paddlespeech/t2s/exps/ernie_sat/utils.py
@@ -11,35 +11,32 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import hashlib
-import os
from pathlib import Path
from typing import Dict
from typing import List
from typing import Union
+import os
import numpy as np
import paddle
import yaml
from yacs.config import CfgNode
+import hashlib
+
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
-
def _get_user():
return os.path.expanduser('~').split('/')[-1]
-
def str2md5(string):
md5_val = hashlib.md5(string.encode('utf8')).hexdigest()
return md5_val
-
-def get_tmp_name(text: str):
+def get_tmp_name(text:str):
return _get_user() + '_' + str(os.getpid()) + '_' + str2md5(text)
-
def get_dict(dictfile: str):
word2phns_dict = {}
with open(dictfile, 'r') as fid:
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 15d8dfb78..127e1a3ba 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -82,10 +82,6 @@ def denorm(data, mean, std):
return data * std + mean
-def norm(data, mean, std):
- return (data - mean) / std
-
-
def get_chunks(data, block_size: int, pad_size: int):
data_len = data.shape[1]
chunks = []
@@ -298,8 +294,8 @@ def am_to_static(am_inference,
am_name = am[:am.rindex('_')]
am_dataset = am[am.rindex('_') + 1:]
if am_name == 'fastspeech2':
- if am_dataset in {"aishell3", "vctk",
- "mix"} and speaker_dict is not None:
+ if am_dataset in {"aishell3", "vctk", "mix"
+ } and speaker_dict is not None:
am_inference = jit.to_static(
am_inference,
input_spec=[
@@ -311,8 +307,8 @@ def am_to_static(am_inference,
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
elif am_name == 'speedyspeech':
- if am_dataset in {"aishell3", "vctk",
- "mix"} and speaker_dict is not None:
+ if am_dataset in {"aishell3", "vctk", "mix"
+ } and speaker_dict is not None:
am_inference = jit.to_static(
am_inference,
input_spec=[
diff --git a/paddlespeech/t2s/exps/vits/synthesize.py b/paddlespeech/t2s/exps/vits/synthesize.py
index 968684b25..074b890f9 100644
--- a/paddlespeech/t2s/exps/vits/synthesize.py
+++ b/paddlespeech/t2s/exps/vits/synthesize.py
@@ -15,7 +15,6 @@ import argparse
from pathlib import Path
import jsonlines
-import numpy as np
import paddle
import soundfile as sf
import yaml
@@ -24,7 +23,6 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.vits import VITS
-from paddlespeech.t2s.utils import str2bool
def evaluate(args):
@@ -42,26 +40,8 @@ def evaluate(args):
print(config)
fields = ["utt_id", "text"]
- converters = {}
-
- spk_num = None
- if args.speaker_dict is not None:
- print("multiple speaker vits!")
- with open(args.speaker_dict, 'rt') as f:
- spk_id = [line.strip().split() for line in f.readlines()]
- spk_num = len(spk_id)
- fields += ["spk_id"]
- elif args.voice_cloning:
- print("Evaluating voice cloning!")
- fields += ["spk_emb"]
- else:
- print("single speaker vits!")
- print("spk_num:", spk_num)
- test_dataset = DataTable(
- data=test_metadata,
- fields=fields,
- converters=converters, )
+ test_dataset = DataTable(data=test_metadata, fields=fields)
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
@@ -69,7 +49,6 @@ def evaluate(args):
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
- config["model"]["generator_params"]["spks"] = spk_num
vits = VITS(idim=vocab_size, odim=odim, **config["model"])
vits.set_state_dict(paddle.load(args.ckpt)["main_params"])
@@ -86,15 +65,7 @@ def evaluate(args):
phone_ids = paddle.to_tensor(datum["text"])
with timer() as t:
with paddle.no_grad():
- spk_emb = None
- spk_id = None
- # multi speaker
- if args.voice_cloning and "spk_emb" in datum:
- spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
- elif "spk_id" in datum:
- spk_id = paddle.to_tensor(datum["spk_id"])
- out = vits.inference(
- text=phone_ids, sids=spk_id, spembs=spk_emb)
+ out = vits.inference(text=phone_ids)
wav = out["wav"]
wav = wav.numpy()
N += wav.size
@@ -119,13 +90,6 @@ def parse_args():
'--ckpt', type=str, default=None, help='Checkpoint file of VITS.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
- parser.add_argument(
- "--speaker_dict", type=str, default=None, help="speaker id map file.")
- parser.add_argument(
- "--voice-cloning",
- type=str2bool,
- default=False,
- help="whether training voice cloning model.")
# other
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
diff --git a/paddlespeech/t2s/exps/vits/synthesize_e2e.py b/paddlespeech/t2s/exps/vits/synthesize_e2e.py
index f9d10ea62..33a413751 100644
--- a/paddlespeech/t2s/exps/vits/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/vits/synthesize_e2e.py
@@ -42,23 +42,12 @@ def evaluate(args):
# frontend
frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)
- spk_num = None
- if args.speaker_dict is not None:
- print("multiple speaker vits!")
- with open(args.speaker_dict, 'rt') as f:
- spk_id = [line.strip().split() for line in f.readlines()]
- spk_num = len(spk_id)
- else:
- print("single speaker vits!")
- print("spk_num:", spk_num)
-
with open(args.phones_dict, "r") as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
- config["model"]["generator_params"]["spks"] = spk_num
vits = VITS(idim=vocab_size, odim=odim, **config["model"])
vits.set_state_dict(paddle.load(args.ckpt)["main_params"])
@@ -89,10 +78,7 @@ def evaluate(args):
flags = 0
for i in range(len(phone_ids)):
part_phone_ids = phone_ids[i]
- spk_id = None
- if spk_num is not None:
- spk_id = paddle.to_tensor(args.spk_id)
- out = vits.inference(text=part_phone_ids, sids=spk_id)
+ out = vits.inference(text=part_phone_ids)
wav = out["wav"]
if flags == 0:
wav_all = wav
@@ -123,13 +109,6 @@ def parse_args():
'--ckpt', type=str, default=None, help='Checkpoint file of VITS.')
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
- parser.add_argument(
- "--speaker_dict", type=str, default=None, help="speaker id map file.")
- parser.add_argument(
- '--spk_id',
- type=int,
- default=0,
- help='spk id for multi speaker acoustic model')
# other
parser.add_argument(
'--lang',
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index c994faa5a..1a68d1326 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -28,7 +28,6 @@ from paddle.io import DistributedBatchSampler
from paddle.optimizer import Adam
from yacs.config import CfgNode
-from paddlespeech.t2s.datasets.am_batch_fn import vits_multi_spk_batch_fn
from paddlespeech.t2s.datasets.am_batch_fn import vits_single_spk_batch_fn
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.models.vits import VITS
@@ -44,7 +43,6 @@ from paddlespeech.t2s.training.extensions.visualizer import VisualDL
from paddlespeech.t2s.training.optimizer import scheduler_classes
from paddlespeech.t2s.training.seeding import seed_everything
from paddlespeech.t2s.training.trainer import Trainer
-from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
@@ -74,23 +72,6 @@ def train_sp(args, config):
"wave": np.load,
"feats": np.load,
}
- spk_num = None
- if args.speaker_dict is not None:
- print("multiple speaker vits!")
- collate_fn = vits_multi_spk_batch_fn
- with open(args.speaker_dict, 'rt') as f:
- spk_id = [line.strip().split() for line in f.readlines()]
- spk_num = len(spk_id)
- fields += ["spk_id"]
- elif args.voice_cloning:
- print("Training voice cloning!")
- collate_fn = vits_multi_spk_batch_fn
- fields += ["spk_emb"]
- converters["spk_emb"] = np.load
- else:
- print("single speaker vits!")
- collate_fn = vits_single_spk_batch_fn
- print("spk_num:", spk_num)
# construct dataset for training and validation
with jsonlines.open(args.train_metadata, 'r') as reader:
@@ -119,16 +100,18 @@ def train_sp(args, config):
drop_last=False)
print("samplers done!")
+ train_batch_fn = vits_single_spk_batch_fn
+
train_dataloader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
- collate_fn=collate_fn,
+ collate_fn=train_batch_fn,
num_workers=config.num_workers)
dev_dataloader = DataLoader(
dev_dataset,
batch_sampler=dev_sampler,
- collate_fn=collate_fn,
+ collate_fn=train_batch_fn,
num_workers=config.num_workers)
print("dataloaders done!")
@@ -138,7 +121,6 @@ def train_sp(args, config):
print("vocab_size:", vocab_size)
odim = config.n_fft // 2 + 1
- config["model"]["generator_params"]["spks"] = spk_num
model = VITS(idim=vocab_size, odim=odim, **config["model"])
gen_parameters = model.generator.parameters()
dis_parameters = model.discriminator.parameters()
@@ -258,17 +240,6 @@ def main():
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
- parser.add_argument(
- "--speaker-dict",
- type=str,
- default=None,
- help="speaker id map file for multiple speaker model.")
-
- parser.add_argument(
- "--voice-cloning",
- type=str2bool,
- default=False,
- help="whether training voice cloning model.")
args = parser.parse_args()
diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py
index 80cfea4a6..b51a4d7bc 100644
--- a/paddlespeech/t2s/exps/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning.py
@@ -21,28 +21,13 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
-from paddlespeech.cli.vector import VectorExecutor
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.utils import str2bool
from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor
from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder
-def gen_random_embed(use_ecapa: bool=False):
- if use_ecapa:
- # Randomly generate numbers of -25 ~ 25, 192 is the dim of spk_emb
- random_spk_emb = (-1 + 2 * np.random.rand(192)) * 25
-
- # GE2E
- else:
- # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
- random_spk_emb = np.random.rand(256) * 0.2
- random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32')
- return random_spk_emb
-
-
def voice_cloning(args):
# Init body.
with open(args.am_config) as f:
@@ -56,47 +41,30 @@ def voice_cloning(args):
print(am_config)
print(voc_config)
- output_dir = Path(args.output_dir)
- output_dir.mkdir(parents=True, exist_ok=True)
-
- input_dir = Path(args.input_dir)
-
# speaker encoder
- if args.use_ecapa:
- vec_executor = VectorExecutor()
- # warm up
- vec_executor(
- audio_file=input_dir / os.listdir(input_dir)[0], force_yes=True)
- print("ECAPA-TDNN Done!")
- # use GE2E
- else:
- p = SpeakerVerificationPreprocessor(
- sampling_rate=16000,
- audio_norm_target_dBFS=-30,
- vad_window_length=30,
- vad_moving_average_width=8,
- vad_max_silence_length=6,
- mel_window_length=25,
- mel_window_step=10,
- n_mels=40,
- partial_n_frames=160,
- min_pad_coverage=0.75,
- partial_overlap_ratio=0.5)
- print("Audio Processor Done!")
-
- speaker_encoder = LSTMSpeakerEncoder(
- n_mels=40, num_layers=3, hidden_size=256, output_size=256)
- speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
- speaker_encoder.eval()
- print("GE2E Done!")
+ p = SpeakerVerificationPreprocessor(
+ sampling_rate=16000,
+ audio_norm_target_dBFS=-30,
+ vad_window_length=30,
+ vad_moving_average_width=8,
+ vad_max_silence_length=6,
+ mel_window_length=25,
+ mel_window_step=10,
+ n_mels=40,
+ partial_n_frames=160,
+ min_pad_coverage=0.75,
+ partial_overlap_ratio=0.5)
+ print("Audio Processor Done!")
+
+ speaker_encoder = LSTMSpeakerEncoder(
+ n_mels=40, num_layers=3, hidden_size=256, output_size=256)
+ speaker_encoder.set_state_dict(paddle.load(args.ge2e_params_path))
+ speaker_encoder.eval()
+ print("GE2E Done!")
frontend = Frontend(phone_vocab_path=args.phones_dict)
print("frontend done!")
- sentence = args.text
- input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
- phone_ids = input_ids["phone_ids"][0]
-
# acoustic model
am_inference = get_am_inference(
am=args.am,
@@ -112,19 +80,26 @@ def voice_cloning(args):
voc_ckpt=args.voc_ckpt,
voc_stat=args.voc_stat)
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ input_dir = Path(args.input_dir)
+
+ sentence = args.text
+
+ input_ids = frontend.get_input_ids(sentence, merge_sentences=True)
+ phone_ids = input_ids["phone_ids"][0]
+
for name in os.listdir(input_dir):
utt_id = name.split(".")[0]
ref_audio_path = input_dir / name
- if args.use_ecapa:
- spk_emb = vec_executor(audio_file=ref_audio_path, force_yes=True)
- spk_emb = paddle.to_tensor(spk_emb)
- # GE2E
- else:
- mel_sequences = p.extract_mel_partials(
- p.preprocess_wav(ref_audio_path))
- with paddle.no_grad():
- spk_emb = speaker_encoder.embed_utterance(
- paddle.to_tensor(mel_sequences))
+ mel_sequences = p.extract_mel_partials(p.preprocess_wav(ref_audio_path))
+ # print("mel_sequences: ", mel_sequences.shape)
+ with paddle.no_grad():
+ spk_emb = speaker_encoder.embed_utterance(
+ paddle.to_tensor(mel_sequences))
+ # print("spk_emb shape: ", spk_emb.shape)
+
with paddle.no_grad():
wav = voc_inference(am_inference(phone_ids, spk_emb=spk_emb))
@@ -133,17 +108,16 @@ def voice_cloning(args):
wav.numpy(),
samplerate=am_config.fs)
print(f"{utt_id} done!")
-
- # generate 5 random_spk_emb
- for i in range(5):
- random_spk_emb = gen_random_embed(args.use_ecapa)
- utt_id = "random_spk_emb"
- with paddle.no_grad():
- wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb))
- sf.write(
- str(output_dir / (utt_id + "_" + str(i) + ".wav")),
- wav.numpy(),
- samplerate=am_config.fs)
+ # Randomly generate numbers of 0 ~ 0.2, 256 is the dim of spk_emb
+ random_spk_emb = np.random.rand(256) * 0.2
+ random_spk_emb = paddle.to_tensor(random_spk_emb, dtype='float32')
+ utt_id = "random_spk_emb"
+ with paddle.no_grad():
+ wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb))
+ sf.write(
+ str(output_dir / (utt_id + ".wav")),
+ wav.numpy(),
+ samplerate=am_config.fs)
print(f"{utt_id} done!")
@@ -197,15 +171,13 @@ def parse_args():
type=str,
default="每当你觉得,想要批评什么人的时候,你切要记着,这个世界上的人,并非都具备你禀有的条件。",
help="text to synthesize, a line")
+
parser.add_argument(
"--ge2e_params_path", type=str, help="ge2e params path.")
- parser.add_argument(
- "--use_ecapa",
- type=str2bool,
- default=False,
- help="whether to use ECAPA-TDNN as speaker encoder.")
+
parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
+
parser.add_argument(
"--input-dir",
type=str,
diff --git a/paddlespeech/t2s/frontend/g2pw/__init__.py b/paddlespeech/t2s/frontend/g2pw/__init__.py
index 0eaeee5df..6e1ee0db8 100644
--- a/paddlespeech/t2s/frontend/g2pw/__init__.py
+++ b/paddlespeech/t2s/frontend/g2pw/__init__.py
@@ -1 +1,2 @@
from paddlespeech.t2s.frontend.g2pw.onnx_api import G2PWOnnxConverter
+
diff --git a/paddlespeech/t2s/frontend/g2pw/dataset.py b/paddlespeech/t2s/frontend/g2pw/dataset.py
index 98af5f463..ab715dc36 100644
--- a/paddlespeech/t2s/frontend/g2pw/dataset.py
+++ b/paddlespeech/t2s/frontend/g2pw/dataset.py
@@ -81,12 +81,12 @@ def prepare_onnx_input(tokenizer,
position_ids.append(position_id)
outputs = {
- 'input_ids': np.array(input_ids).astype(np.int64),
- 'token_type_ids': np.array(token_type_ids).astype(np.int64),
- 'attention_masks': np.array(attention_masks).astype(np.int64),
+ 'input_ids': np.array(input_ids),
+ 'token_type_ids': np.array(token_type_ids),
+ 'attention_masks': np.array(attention_masks),
'phoneme_masks': np.array(phoneme_masks).astype(np.float32),
- 'char_ids': np.array(char_ids).astype(np.int64),
- 'position_ids': np.array(position_ids).astype(np.int64),
+ 'char_ids': np.array(char_ids),
+ 'position_ids': np.array(position_ids),
}
return outputs
diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
index 180e8ae15..9e708ec88 100644
--- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py
+++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py
@@ -34,7 +34,7 @@ from paddlespeech.t2s.frontend.g2pw.utils import load_config
from paddlespeech.t2s.frontend.zh_normalization.char_convert import tranditional_to_simplified
from paddlespeech.utils.env import MODEL_HOME
-model_version = '1.1'
+model_version = '1.0'
def predict(session, onnx_input, labels):
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index 101a1e503..6868d3357 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -61,11 +61,7 @@ class MixFrontend():
return False
def is_end(self, before_char, after_char) -> bool:
- flag = 0
- for char in (before_char, after_char):
- if self.is_alphabet(char) or char == " ":
- flag += 1
- if flag == 2:
+ if ((self.is_alphabet(before_char) or before_char == " ") and (self.is_alphabet(after_char) or after_char == " ")):
return True
else:
return False
@@ -90,11 +86,10 @@ class MixFrontend():
if point_index == 0 or point_index == len(text) - 1:
new_text = text
else:
- if not self.is_end(text[point_index - 1], text[point_index +
- 1]):
+ if not self.is_end(text[point_index - 1], text[point_index + 1]):
new_text = text
else:
- new_text = text[:point_index] + "。" + text[point_index + 1:]
+ new_text = text[: point_index] + "。" + text[point_index + 1:]
elif len(point_indexs) == 2:
first_index = point_indexs[0]
@@ -102,8 +97,7 @@ class MixFrontend():
# first
if first_index != 0:
- if not self.is_end(text[first_index - 1], text[first_index +
- 1]):
+ if not self.is_end(text[first_index - 1], text[first_index + 1]):
new_text += (text[:first_index] + ".")
else:
new_text += (text[:first_index] + "。")
@@ -112,20 +106,18 @@ class MixFrontend():
# last
if end_index != len(text) - 1:
if not self.is_end(text[end_index - 1], text[end_index + 1]):
- new_text += text[point_indexs[-2] + 1:]
+ new_text += text[point_indexs[-2] + 1 : ]
else:
- new_text += (text[point_indexs[-2] + 1:end_index] + "。" +
- text[end_index + 1:])
+ new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ])
else:
- new_text += "."
+ new_text += "."
else:
first_index = point_indexs[0]
end_index = point_indexs[-1]
# first
if first_index != 0:
- if not self.is_end(text[first_index - 1], text[first_index +
- 1]):
+ if not self.is_end(text[first_index - 1], text[first_index + 1]):
new_text += (text[:first_index] + ".")
else:
new_text += (text[:first_index] + "。")
@@ -134,20 +126,16 @@ class MixFrontend():
# middle
for j in range(1, len(point_indexs) - 1):
point_index = point_indexs[j]
- if not self.is_end(text[point_index - 1], text[point_index +
- 1]):
- new_text += (
- text[point_indexs[j - 1] + 1:point_index] + ".")
+ if not self.is_end(text[point_index - 1], text[point_index + 1]):
+ new_text += (text[point_indexs[j-1] + 1 : point_index] + ".")
else:
- new_text += (
- text[point_indexs[j - 1] + 1:point_index] + "。")
+ new_text += (text[point_indexs[j-1] + 1 : point_index] + "。")
# last
if end_index != len(text) - 1:
if not self.is_end(text[end_index - 1], text[end_index + 1]):
- new_text += text[point_indexs[-2] + 1:]
+ new_text += text[point_indexs[-2] + 1 : ]
else:
- new_text += (text[point_indexs[-2] + 1:end_index] + "。" +
- text[end_index + 1:])
+ new_text += (text[point_indexs[-2] + 1 : end_index] + "。" + text[end_index + 1 : ])
else:
new_text += "."
@@ -236,7 +224,7 @@ class MixFrontend():
def get_input_ids(self,
sentence: str,
- merge_sentences: bool=False,
+ merge_sentences: bool=True,
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
@@ -244,29 +232,28 @@ class MixFrontend():
sentences = self._split(sentence)
phones_list = []
result = {}
+
for text in sentences:
phones_seg = []
segments = self._distinguish(text)
for seg in segments:
content = seg[0]
lang = seg[1]
- if content != '':
- if lang == "en":
- input_ids = self.en_frontend.get_input_ids(
- content, merge_sentences=True, to_tensor=to_tensor)
- else:
- input_ids = self.zh_frontend.get_input_ids(
- content,
- merge_sentences=True,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
-
- phones_seg.append(input_ids["phone_ids"][0])
- if add_sp:
- phones_seg.append(self.sp_id_tensor)
-
- if phones_seg == []:
- phones_seg.append(self.sp_id_tensor)
+ if lang == "zh":
+ input_ids = self.zh_frontend.get_input_ids(
+ content,
+ merge_sentences=True,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
+
+ elif lang == "en":
+ input_ids = self.en_frontend.get_input_ids(
+ content, merge_sentences=True, to_tensor=to_tensor)
+
+ phones_seg.append(input_ids["phone_ids"][0])
+ if add_sp:
+ phones_seg.append(self.sp_id_tensor)
+
phones = paddle.concat(phones_seg)
phones_list.append(phones)
diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml
index 51b76f23f..2c7cf33fb 100644
--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -42,8 +42,3 @@ polyphonic:
咖喱: ['ga1','li5']
时分: ['shi2','fen1']
蚌埠: ['beng4','bu4']
- 驯服: ['xun4','fu2']
- 幸免于难: ['xing4','mian3','yu2','nan4']
- 恶行: ['e4','xing2']
- 唉: ['ai4']
-
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 9fff4272c..e5ef617a9 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -42,7 +42,7 @@ class ToneSandhi():
'木头', '木匠', '朋友', '月饼', '月亮', '暖和', '明白', '时候', '新鲜', '故事', '收拾',
'收成', '提防', '挖苦', '挑剔', '指甲', '指头', '拾掇', '拳头', '拨弄', '招牌', '招呼',
'抬举', '护士', '折腾', '扫帚', '打量', '打算', '打扮', '打听', '打发', '扎实', '扁担',
- '戒指', '懒得', '意识', '意思', '悟性', '怪物', '思量', '怎么', '念头', '念叨', '别人',
+ '戒指', '懒得', '意识', '意思', '情形', '悟性', '怪物', '思量', '怎么', '念头', '念叨',
'快活', '忙活', '志气', '心思', '得罪', '张罗', '弟兄', '开通', '应酬', '庄稼', '干事',
'帮手', '帐篷', '希罕', '师父', '师傅', '巴结', '巴掌', '差事', '工夫', '岁数', '屁股',
'尾巴', '少爷', '小气', '小伙', '将就', '对头', '对付', '寡妇', '家伙', '客气', '实在',
@@ -60,7 +60,7 @@ class ToneSandhi():
'邋遢', '费用', '冤家', '甜头', '介绍', '荒唐', '大人', '泥鳅', '幸福', '熟悉', '计划',
'扑腾', '蜡烛', '姥爷', '照顾', '喉咙', '吉他', '弄堂', '蚂蚱', '凤凰', '拖沓', '寒碜',
'糟蹋', '倒腾', '报复', '逻辑', '盘缠', '喽啰', '牢骚', '咖喱', '扫把', '惦记', '戏弄',
- '将军'
+ '将军', '别人'
}
self.must_not_neural_tone_words = {
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
@@ -84,7 +84,7 @@ class ToneSandhi():
if j - 1 >= 0 and item == word[j - 1] and pos[0] in {"n", "v", "a"}:
finals[j] = finals[j][:-1] + "5"
ge_idx = word.find("个")
- if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒滴哩哟喽啰耶喔诶":
+ if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
finals[-1] = finals[-1][:-1] + "5"
elif len(word) >= 1 and word[-1] in "的地得":
finals[-1] = finals[-1][:-1] + "5"
@@ -169,7 +169,6 @@ class ToneSandhi():
return new_word_list
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
-
if len(word) == 2 and self._all_tone_three(finals):
finals[0] = finals[0][:-1] + "2"
elif len(word) == 3:
@@ -347,7 +346,6 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]:
-
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
diff --git a/paddlespeech/t2s/frontend/zh_normalization/num.py b/paddlespeech/t2s/frontend/zh_normalization/num.py
index 8a54d3e63..ec1367736 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/num.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/num.py
@@ -28,7 +28,7 @@ UNITS = OrderedDict({
8: '亿',
})
-COM_QUANTIFIERS = '(封|艘|把|目|套|段|人|所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|十|)吨|(亿|千万|百万|万|千|百|)块|角|毛|分)'
+COM_QUANTIFIERS = '(所|朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|小时|旬|纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
# 分数表达式
RE_FRAC = re.compile(r'(-?)(\d+)/(\d+)')
diff --git a/paddlespeech/t2s/models/ernie_sat/__init__.py b/paddlespeech/t2s/models/ernie_sat/__init__.py
index 87e7afe85..7e795370e 100644
--- a/paddlespeech/t2s/models/ernie_sat/__init__.py
+++ b/paddlespeech/t2s/models/ernie_sat/__init__.py
@@ -13,3 +13,4 @@
# limitations under the License.
from .ernie_sat import *
from .ernie_sat_updater import *
+from .mlm import *
diff --git a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
index 08c43dc5f..54f5d542d 100644
--- a/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
+++ b/paddlespeech/t2s/models/ernie_sat/ernie_sat.py
@@ -389,7 +389,7 @@ class MLM(nn.Layer):
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor,
span_bdy: List[int],
- use_teacher_forcing: bool=True, ) -> List[paddle.Tensor]:
+ use_teacher_forcing: bool=False, ) -> List[paddle.Tensor]:
'''
Args:
speech (paddle.Tensor): input speech (1, Tmax, D).
@@ -657,7 +657,7 @@ class ErnieSAT(nn.Layer):
speech_seg_pos: paddle.Tensor,
text_seg_pos: paddle.Tensor,
span_bdy: List[int],
- use_teacher_forcing: bool=True, ) -> Dict[str, paddle.Tensor]:
+ use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
return self.model.inference(
speech=speech,
text=text,
diff --git a/paddlespeech/t2s/models/ernie_sat/mlm.py b/paddlespeech/t2s/models/ernie_sat/mlm.py
new file mode 100644
index 000000000..647fdd9b4
--- /dev/null
+++ b/paddlespeech/t2s/models/ernie_sat/mlm.py
@@ -0,0 +1,579 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from typing import Dict
+from typing import List
+from typing import Optional
+
+import paddle
+import yaml
+from paddle import nn
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.modules.activation import get_activation
+from paddlespeech.t2s.modules.conformer.convolution import ConvolutionModule
+from paddlespeech.t2s.modules.conformer.encoder_layer import EncoderLayer
+from paddlespeech.t2s.modules.layer_norm import LayerNorm
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
+from paddlespeech.t2s.modules.transformer.attention import LegacyRelPositionMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.attention import MultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.attention import RelPositionMultiHeadedAttention
+from paddlespeech.t2s.modules.transformer.embedding import LegacyRelPositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import RelPositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import Conv1dLinear
+from paddlespeech.t2s.modules.transformer.multi_layer_conv import MultiLayeredConv1d
+from paddlespeech.t2s.modules.transformer.positionwise_feed_forward import PositionwiseFeedForward
+from paddlespeech.t2s.modules.transformer.repeat import repeat
+from paddlespeech.t2s.modules.transformer.subsampling import Conv2dSubsampling
+
+
+# MLM -> Masked Language Model
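+# mySequential: an nn.Sequential variant that unpacks tuple outputs
+# (e.g. (x, pos_emb) pairs produced by positional-encoding layers) when
+# calling the next sub-layer, so multi-output modules can be chained.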
+class mySequential(nn.Sequential):
+ def forward(self, *inputs):
+ for module in self._sub_layers.values():
+ if type(inputs) == tuple:
+ inputs = module(*inputs)
+ else:
+ inputs = module(inputs)
+ return inputs
+
+
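+# MaskInputLayer replaces masked frames with a learned mask embedding:
+# positions flagged in masked_pos are zeroed out and filled with the
+# broadcast mask_feature parameter instead.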
+class MaskInputLayer(nn.Layer):
+ def __init__(self, out_features: int) -> None:
+ super().__init__()
+ self.mask_feature = paddle.create_parameter(
+ shape=(1, 1, out_features),
+ dtype=paddle.float32,
+ default_initializer=paddle.nn.initializer.Assign(
+ paddle.normal(shape=(1, 1, out_features))))
+
+ def forward(self, input: paddle.Tensor,
+ masked_pos: paddle.Tensor=None) -> paddle.Tensor:
+ masked_pos = paddle.expand_as(paddle.unsqueeze(masked_pos, -1), input)
+ masked_input = masked_fill(input, masked_pos, 0) + masked_fill(
+ paddle.expand_as(self.mask_feature, input), ~masked_pos, 0)
+ return masked_input
+
+
+class MLMEncoder(nn.Layer):
+ """Conformer encoder module.
+
+ Args:
+ idim (int): Input dimension.
+ attention_dim (int): Dimension of attention.
+ attention_heads (int): The number of heads of multi head attention.
+ linear_units (int): The number of units of position-wise feed forward.
+ num_blocks (int): The number of decoder blocks.
+ dropout_rate (float): Dropout rate.
+ positional_dropout_rate (float): Dropout rate after adding positional encoding.
+ attention_dropout_rate (float): Dropout rate in attention.
+ input_layer (Union[str, paddle.nn.Layer]): Input layer type.
+ normalize_before (bool): Whether to use layer_norm before the first block.
+ concat_after (bool): Whether to concat attention layer's input and output.
+ if True, additional linear will be applied.
+ i.e. x -> x + linear(concat(x, att(x)))
+ if False, no additional linear will be applied. i.e. x -> x + att(x)
+ positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+ positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+ macaron_style (bool): Whether to use macaron style for positionwise layer.
+ pos_enc_layer_type (str): Encoder positional encoding layer type.
+ selfattention_layer_type (str): Encoder attention layer type.
+ activation_type (str): Encoder activation function type.
+ use_cnn_module (bool): Whether to use convolution module.
+ zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+ padding_idx (int): Padding idx for input_layer=embed.
+ stochastic_depth_rate (float): Maximum probability to skip the encoder layer.
+
+ """
+
+ def __init__(self,
+ idim: int,
+ vocab_size: int=0,
+ pre_speech_layer: int=0,
+ attention_dim: int=256,
+ attention_heads: int=4,
+ linear_units: int=2048,
+ num_blocks: int=6,
+ dropout_rate: float=0.1,
+ positional_dropout_rate: float=0.1,
+ attention_dropout_rate: float=0.0,
+ input_layer: str="conv2d",
+ normalize_before: bool=True,
+ concat_after: bool=False,
+ positionwise_layer_type: str="linear",
+ positionwise_conv_kernel_size: int=1,
+ macaron_style: bool=False,
+ pos_enc_layer_type: str="abs_pos",
+ selfattention_layer_type: str="selfattn",
+ activation_type: str="swish",
+ use_cnn_module: bool=False,
+ zero_triu: bool=False,
+ cnn_module_kernel: int=31,
+ padding_idx: int=-1,
+ stochastic_depth_rate: float=0.0,
+ text_masking: bool=False):
+ """Construct an Encoder object."""
+ super().__init__()
+ self._output_size = attention_dim
+ self.text_masking = text_masking
+ if self.text_masking:
+ self.text_masking_layer = MaskInputLayer(attention_dim)
+ activation = get_activation(activation_type)
+ if pos_enc_layer_type == "abs_pos":
+ pos_enc_class = PositionalEncoding
+ elif pos_enc_layer_type == "scaled_abs_pos":
+ pos_enc_class = ScaledPositionalEncoding
+ elif pos_enc_layer_type == "rel_pos":
+ assert selfattention_layer_type == "rel_selfattn"
+ pos_enc_class = RelPositionalEncoding
+ elif pos_enc_layer_type == "legacy_rel_pos":
+ pos_enc_class = LegacyRelPositionalEncoding
+ assert selfattention_layer_type == "legacy_rel_selfattn"
+ else:
+ raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+
+ self.conv_subsampling_factor = 1
+ if input_layer == "linear":
+ self.embed = nn.Sequential(
+ nn.Linear(idim, attention_dim),
+ nn.LayerNorm(attention_dim),
+ nn.Dropout(dropout_rate),
+ nn.ReLU(),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer == "conv2d":
+ self.embed = Conv2dSubsampling(
+ idim,
+ attention_dim,
+ dropout_rate,
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ self.conv_subsampling_factor = 4
+ elif input_layer == "embed":
+ self.embed = nn.Sequential(
+ nn.Embedding(idim, attention_dim, padding_idx=padding_idx),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer == "mlm":
+ self.segment_emb = None
+ self.speech_embed = mySequential(
+ MaskInputLayer(idim),
+ nn.Linear(idim, attention_dim),
+ nn.LayerNorm(attention_dim),
+ nn.ReLU(),
+ pos_enc_class(attention_dim, positional_dropout_rate))
+ self.text_embed = nn.Sequential(
+ nn.Embedding(
+ vocab_size, attention_dim, padding_idx=padding_idx),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer == "sega_mlm":
+ self.segment_emb = nn.Embedding(
+ 500, attention_dim, padding_idx=padding_idx)
+ self.speech_embed = mySequential(
+ MaskInputLayer(idim),
+ nn.Linear(idim, attention_dim),
+ nn.LayerNorm(attention_dim),
+ nn.ReLU(),
+ pos_enc_class(attention_dim, positional_dropout_rate))
+ self.text_embed = nn.Sequential(
+ nn.Embedding(
+ vocab_size, attention_dim, padding_idx=padding_idx),
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif isinstance(input_layer, nn.Layer):
+ self.embed = nn.Sequential(
+ input_layer,
+ pos_enc_class(attention_dim, positional_dropout_rate), )
+ elif input_layer is None:
+ self.embed = nn.Sequential(
+ pos_enc_class(attention_dim, positional_dropout_rate))
+ else:
+ raise ValueError("unknown input_layer: " + input_layer)
+ self.normalize_before = normalize_before
+
+ # self-attention module definition
+ if selfattention_layer_type == "selfattn":
+ encoder_selfattn_layer = MultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, )
+ elif selfattention_layer_type == "legacy_rel_selfattn":
+ assert pos_enc_layer_type == "legacy_rel_pos"
+ encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, )
+ elif selfattention_layer_type == "rel_selfattn":
+ assert pos_enc_layer_type == "rel_pos"
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, attention_dim,
+ attention_dropout_rate, zero_triu, )
+ else:
+ raise ValueError("unknown encoder_attn_layer: " +
+ selfattention_layer_type)
+
+ # feed-forward module definition
+ if positionwise_layer_type == "linear":
+ positionwise_layer = PositionwiseFeedForward
+ positionwise_layer_args = (attention_dim, linear_units,
+ dropout_rate, activation, )
+ elif positionwise_layer_type == "conv1d":
+ positionwise_layer = MultiLayeredConv1d
+ positionwise_layer_args = (attention_dim, linear_units,
+ positionwise_conv_kernel_size,
+ dropout_rate, )
+ elif positionwise_layer_type == "conv1d-linear":
+ positionwise_layer = Conv1dLinear
+ positionwise_layer_args = (attention_dim, linear_units,
+ positionwise_conv_kernel_size,
+ dropout_rate, )
+ else:
+ raise NotImplementedError("Support only linear or conv1d.")
+
+ # convolution module definition
+ convolution_layer = ConvolutionModule
+ convolution_layer_args = (attention_dim, cnn_module_kernel, activation)
+
+ self.encoders = repeat(
+ num_blocks,
+ lambda lnum: EncoderLayer(
+ attention_dim,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+ convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ stochastic_depth_rate * float(1 + lnum) / num_blocks, ), )
+ self.pre_speech_layer = pre_speech_layer
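+        # Optional extra conformer blocks applied to the speech branch alone
+        # before it is concatenated with the text branch in forward().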
+ self.pre_speech_encoders = repeat(
+ self.pre_speech_layer,
+ lambda lnum: EncoderLayer(
+ attention_dim,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+ convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ stochastic_depth_rate * float(1 + lnum) / self.pre_speech_layer, ),
+ )
+ if self.normalize_before:
+ self.after_norm = LayerNorm(attention_dim)
+
+ def forward(self,
+ speech: paddle.Tensor,
+ text: paddle.Tensor,
+ masked_pos: paddle.Tensor,
+ speech_mask: paddle.Tensor=None,
+ text_mask: paddle.Tensor=None,
+ speech_seg_pos: paddle.Tensor=None,
+ text_seg_pos: paddle.Tensor=None):
+ """Encode input sequence.
+
+ """
+ if masked_pos is not None:
+ speech = self.speech_embed(speech, masked_pos)
+ else:
+ speech = self.speech_embed(speech)
+ if text is not None:
+ text = self.text_embed(text)
+ if speech_seg_pos is not None and text_seg_pos is not None and self.segment_emb:
+ speech_seg_emb = self.segment_emb(speech_seg_pos)
+ text_seg_emb = self.segment_emb(text_seg_pos)
+ text = (text[0] + text_seg_emb, text[1])
+ speech = (speech[0] + speech_seg_emb, speech[1])
+ if self.pre_speech_encoders:
+ speech, _ = self.pre_speech_encoders(speech, speech_mask)
+
+ if text is not None:
+ xs = paddle.concat([speech[0], text[0]], axis=1)
+ xs_pos_emb = paddle.concat([speech[1], text[1]], axis=1)
+ masks = paddle.concat([speech_mask, text_mask], axis=-1)
+ else:
+ xs = speech[0]
+ xs_pos_emb = speech[1]
+ masks = speech_mask
+
+ xs, masks = self.encoders((xs, xs_pos_emb), masks)
+
+ if isinstance(xs, tuple):
+ xs = xs[0]
+ if self.normalize_before:
+ xs = self.after_norm(xs)
+
+ return xs, masks
+
+
+class MLMDecoder(MLMEncoder):
+ def forward(self, xs: paddle.Tensor, masks: paddle.Tensor):
+ """Encode input sequence.
+
+ Args:
+ xs (paddle.Tensor): Input tensor (#batch, time, idim).
+ masks (paddle.Tensor): Mask tensor (#batch, time).
+
+ Returns:
+ paddle.Tensor: Output tensor (#batch, time, attention_dim).
+ paddle.Tensor: Mask tensor (#batch, time).
+
+ """
+ xs = self.embed(xs)
+ xs, masks = self.encoders(xs, masks)
+
+ if isinstance(xs, tuple):
+ xs = xs[0]
+ if self.normalize_before:
+ xs = self.after_norm(xs)
+
+ return xs, masks
+
+
+# encoder and decoder are nn.Layer instances here, not strings
+class MLM(nn.Layer):
+ def __init__(self,
+ odim: int,
+ encoder: nn.Layer,
+ decoder: Optional[nn.Layer],
+ postnet_layers: int=0,
+ postnet_chans: int=0,
+ postnet_filts: int=0,
+ text_masking: bool=False):
+
+ super().__init__()
+ self.odim = odim
+ self.encoder = encoder
+ self.decoder = decoder
+ self.vocab_size = encoder.text_embed[0]._num_embeddings
+
+ if self.decoder is None or not (hasattr(self.decoder,
+ 'output_layer') and
+ self.decoder.output_layer is not None):
+ self.sfc = nn.Linear(self.encoder._output_size, odim)
+ else:
+ self.sfc = None
+ if text_masking:
+ self.text_sfc = nn.Linear(
+ self.encoder.text_embed[0]._embedding_dim,
+ self.vocab_size,
+ weight_attr=self.encoder.text_embed[0]._weight_attr)
+ else:
+ self.text_sfc = None
+
+ self.postnet = (None if postnet_layers == 0 else Postnet(
+ idim=self.encoder._output_size,
+ odim=odim,
+ n_layers=postnet_layers,
+ n_chans=postnet_chans,
+ n_filts=postnet_filts,
+ use_batch_norm=True,
+ dropout_rate=0.5, ))
+
+ def inference(
+ self,
+ speech: paddle.Tensor,
+ text: paddle.Tensor,
+ masked_pos: paddle.Tensor,
+ speech_mask: paddle.Tensor,
+ text_mask: paddle.Tensor,
+ speech_seg_pos: paddle.Tensor,
+ text_seg_pos: paddle.Tensor,
+ span_bdy: List[int],
+ use_teacher_forcing: bool=False, ) -> Dict[str, paddle.Tensor]:
+ '''
+ Args:
+ speech (paddle.Tensor): input speech (1, Tmax, D).
+ text (paddle.Tensor): input text (1, Tmax2).
+ masked_pos (paddle.Tensor): masked position of input speech (1, Tmax)
+ speech_mask (paddle.Tensor): mask of speech (1, 1, Tmax).
+ text_mask (paddle.Tensor): mask of text (1, 1, Tmax2).
+ speech_seg_pos (paddle.Tensor): n-th phone of each mel, 0<=n<=Tmax2 (1, Tmax).
+ text_seg_pos (paddle.Tensor): n-th phone of each phone, 0<=n<=Tmax2 (1, Tmax2).
+ span_bdy (List[int]): masked mel boundary of input speech (2,)
+ use_teacher_forcing (bool): whether to use teacher forcing
+ Returns:
+ List[Tensor]:
+ eg:
+ [Tensor(shape=[1, 181, 80]), Tensor(shape=[80, 80]), Tensor(shape=[1, 67, 80])]
+ '''
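+        # Re-assemble the edited utterance: original frames before the span,
+        # model predictions inside the span, original frames after it.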
+
+ z_cache = None
+ if use_teacher_forcing:
+ before_outs, zs, *_ = self.forward(
+ speech=speech,
+ text=text,
+ masked_pos=masked_pos,
+ speech_mask=speech_mask,
+ text_mask=text_mask,
+ speech_seg_pos=speech_seg_pos,
+ text_seg_pos=text_seg_pos)
+ if zs is None:
+ zs = before_outs
+
+ speech = speech.squeeze(0)
+ outs = [speech[:span_bdy[0]]]
+ outs += [zs[0][span_bdy[0]:span_bdy[1]]]
+ outs += [speech[span_bdy[1]:]]
+ return outs
+ return None
+
+
+class MLMEncAsDecoder(MLM):
+ def forward(self,
+ speech: paddle.Tensor,
+ text: paddle.Tensor,
+ masked_pos: paddle.Tensor,
+ speech_mask: paddle.Tensor,
+ text_mask: paddle.Tensor,
+ speech_seg_pos: paddle.Tensor,
+ text_seg_pos: paddle.Tensor):
+ # feats: (Batch, Length, Dim)
+ # -> encoder_out: (Batch, Length2, Dim2)
+ encoder_out, h_masks = self.encoder(
+ speech=speech,
+ text=text,
+ masked_pos=masked_pos,
+ speech_mask=speech_mask,
+ text_mask=text_mask,
+ speech_seg_pos=speech_seg_pos,
+ text_seg_pos=text_seg_pos)
+ if self.decoder is not None:
+ zs, _ = self.decoder(encoder_out, h_masks)
+ else:
+ zs = encoder_out
+ speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+ if self.sfc is not None:
+ before_outs = paddle.reshape(
+ self.sfc(speech_hidden_states),
+ (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+ else:
+ before_outs = speech_hidden_states
+ if self.postnet is not None:
+ after_outs = before_outs + paddle.transpose(
+ self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+ [0, 2, 1])
+ else:
+ after_outs = None
+ return before_outs, after_outs, None
+
+
+class MLMDualMaksing(MLM):
+ def forward(self,
+ speech: paddle.Tensor,
+ text: paddle.Tensor,
+ masked_pos: paddle.Tensor,
+ speech_mask: paddle.Tensor,
+ text_mask: paddle.Tensor,
+ speech_seg_pos: paddle.Tensor,
+ text_seg_pos: paddle.Tensor):
+ # feats: (Batch, Length, Dim)
+ # -> encoder_out: (Batch, Length2, Dim2)
+ encoder_out, h_masks = self.encoder(
+ speech=speech,
+ text=text,
+ masked_pos=masked_pos,
+ speech_mask=speech_mask,
+ text_mask=text_mask,
+ speech_seg_pos=speech_seg_pos,
+ text_seg_pos=text_seg_pos)
+ if self.decoder is not None:
+ zs, _ = self.decoder(encoder_out, h_masks)
+ else:
+ zs = encoder_out
+ speech_hidden_states = zs[:, :paddle.shape(speech)[1], :]
+        text_outs = None
+        if self.text_sfc:
+            text_hidden_states = zs[:, paddle.shape(speech)[1]:, :]
+            text_outs = paddle.reshape(
+                self.text_sfc(text_hidden_states),
+                (paddle.shape(text_hidden_states)[0], -1, self.vocab_size))
+ if self.sfc is not None:
+ before_outs = paddle.reshape(
+ self.sfc(speech_hidden_states),
+ (paddle.shape(speech_hidden_states)[0], -1, self.odim))
+ else:
+ before_outs = speech_hidden_states
+ if self.postnet is not None:
+ after_outs = before_outs + paddle.transpose(
+ self.postnet(paddle.transpose(before_outs, [0, 2, 1])),
+ [0, 2, 1])
+ else:
+ after_outs = None
+ return before_outs, after_outs, text_outs
+
+
+def build_model_from_file(config_file, model_file):
+
+ state_dict = paddle.load(model_file)
+ model_class = MLMDualMaksing if 'conformer_combine_vctk_aishell3_dual_masking' in config_file \
+ else MLMEncAsDecoder
+
+    # build the model from the config
+ with open(config_file) as f:
+ conf = CfgNode(yaml.safe_load(f))
+ model = build_model(conf, model_class)
+ model.set_state_dict(state_dict)
+ return model, conf
+
+
+# select encoder and decoder here
+def build_model(args: argparse.Namespace, model_class=MLMEncAsDecoder) -> MLM:
+ if isinstance(args.token_list, str):
+ with open(args.token_list, encoding="utf-8") as f:
+ token_list = [line.rstrip() for line in f]
+
+        # Overwrite token_list so the config stays portable.
+ args.token_list = list(token_list)
+ elif isinstance(args.token_list, (tuple, list)):
+ token_list = list(args.token_list)
+ else:
+ raise RuntimeError("token_list must be str or list")
+
+ vocab_size = len(token_list)
+ odim = 80
+
+ # Encoder
+ encoder_class = MLMEncoder
+
+ if 'text_masking' in args.model_conf.keys() and args.model_conf[
+ 'text_masking']:
+ args.encoder_conf['text_masking'] = True
+ else:
+ args.encoder_conf['text_masking'] = False
+
+ encoder = encoder_class(
+ args.input_size, vocab_size=vocab_size, **args.encoder_conf)
+
+ # Decoder
+ if args.decoder != 'no_decoder':
+ decoder_class = MLMDecoder
+ decoder = decoder_class(
+ idim=0,
+ input_layer=None,
+ **args.decoder_conf, )
+ else:
+ decoder = None
+
+ # Build model
+ model = model_class(
+ odim=odim,
+ encoder=encoder,
+ decoder=decoder,
+ **args.model_conf, )
+
+ # Initialize
+ if args.init is not None:
+ initialize(model, args.init)
+
+ return model
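+
+
+# Illustrative usage (paths below are placeholders, not shipped assets):
+#   model, conf = build_model_from_file(
+#       config_file="local/erniesat_mlm.yaml",
+#       model_file="exp/checkpoints/snapshot.pdparams")
+#   model.eval()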
diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py
index 359b66258..f87de91a2 100644
--- a/paddlespeech/t2s/models/vits/generator.py
+++ b/paddlespeech/t2s/models/vits/generator.py
@@ -522,82 +522,6 @@ class VITSGenerator(nn.Layer):
return wav.squeeze(1), attn.squeeze(1), dur.squeeze(1)
- def voice_conversion(
- self,
- feats: paddle.Tensor=None,
- feats_lengths: paddle.Tensor=None,
- sids_src: Optional[paddle.Tensor]=None,
- sids_tgt: Optional[paddle.Tensor]=None,
- spembs_src: Optional[paddle.Tensor]=None,
- spembs_tgt: Optional[paddle.Tensor]=None,
- lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
- """Run voice conversion.
- Args:
- feats (Tensor): Feature tensor (B, aux_channels, T_feats,).
- feats_lengths (Tensor): Feature length tensor (B,).
- sids_src (Optional[Tensor]): Speaker index tensor of source feature (B,) or (B, 1).
- sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (B,) or (B, 1).
- spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (B, spk_embed_dim).
- spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (B, spk_embed_dim).
- lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
- Returns:
- Tensor: Generated waveform tensor (B, T_wav).
- """
- # encoder
- g_src = None
- g_tgt = None
- if self.spks is not None:
- # (B, global_channels, 1)
- g_src = self.global_emb(
- paddle.reshape(sids_src, [-1])).unsqueeze(-1)
- g_tgt = self.global_emb(
- paddle.reshape(sids_tgt, [-1])).unsqueeze(-1)
-
- if self.spk_embed_dim is not None:
- # (B, global_channels, 1)
- g_src_ = self.spemb_proj(
- F.normalize(spembs_src.unsqueeze(0))).unsqueeze(-1)
- if g_src is None:
- g_src = g_src_
- else:
- g_src = g_src + g_src_
-
- # (B, global_channels, 1)
- g_tgt_ = self.spemb_proj(
- F.normalize(spembs_tgt.unsqueeze(0))).unsqueeze(-1)
- if g_tgt is None:
- g_tgt = g_tgt_
- else:
- g_tgt = g_tgt + g_tgt_
-
- if self.langs is not None:
- # (B, global_channels, 1)
- g_ = self.lang_emb(paddle.reshape(lids, [-1])).unsqueeze(-1)
-
- if g_src is None:
- g_src = g_
- else:
- g_src = g_src + g_
-
- if g_tgt is None:
- g_tgt = g_
- else:
- g_tgt = g_tgt + g_
-
- # forward posterior encoder
- z, m_q, logs_q, y_mask = self.posterior_encoder(
- feats, feats_lengths, g=g_src)
-
- # forward flow
- # (B, H, T_feats)
- z_p = self.flow(z, y_mask, g=g_src)
-
- # decoder
- z_hat = self.flow(z_p, y_mask, g=g_tgt, inverse=True)
- wav = self.decoder(z_hat * y_mask, g=g_tgt)
-
- return wav.squeeze(1)
-
def _generate_path(self, dur: paddle.Tensor,
mask: paddle.Tensor) -> paddle.Tensor:
"""Generate path a.k.a. monotonic attention.
diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py
index 983bf0a36..5c476be77 100644
--- a/paddlespeech/t2s/models/vits/vits.py
+++ b/paddlespeech/t2s/models/vits/vits.py
@@ -381,7 +381,7 @@ class VITS(nn.Layer):
if use_teacher_forcing:
assert feats is not None
feats = feats[None].transpose([0, 2, 1])
- feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
+ feats_lengths = paddle.to_tensor([paddle.shape(feats)[2]])
wav, att_w, dur = self.generator.inference(
text=text,
text_lengths=text_lengths,
@@ -406,43 +406,3 @@ class VITS(nn.Layer):
max_len=max_len, )
return dict(
wav=paddle.reshape(wav, [-1]), att_w=att_w[0], duration=dur[0])
-
- def voice_conversion(
- self,
- feats: paddle.Tensor,
- sids_src: Optional[paddle.Tensor]=None,
- sids_tgt: Optional[paddle.Tensor]=None,
- spembs_src: Optional[paddle.Tensor]=None,
- spembs_tgt: Optional[paddle.Tensor]=None,
- lids: Optional[paddle.Tensor]=None, ) -> paddle.Tensor:
- """Run voice conversion.
- Args:
- feats (Tensor): Feature tensor (T_feats, aux_channels).
- sids_src (Optional[Tensor]): Speaker index tensor of source feature (1,).
- sids_tgt (Optional[Tensor]): Speaker index tensor of target feature (1,).
- spembs_src (Optional[Tensor]): Speaker embedding tensor of source feature (spk_embed_dim,).
- spembs_tgt (Optional[Tensor]): Speaker embedding tensor of target feature (spk_embed_dim,).
- lids (Optional[Tensor]): Language index tensor (1,).
- Returns:
- Dict[str, Tensor]:
- * wav (Tensor): Generated waveform tensor (T_wav,).
- """
- assert feats is not None
- feats = feats[None].transpose([0, 2, 1])
- feats_lengths = paddle.to_tensor(paddle.shape(feats)[2])
-
- sids_none = sids_src is None and sids_tgt is None
- spembs_none = spembs_src is None and spembs_tgt is None
-
- assert not sids_none or not spembs_none
-
- wav = self.generator.voice_conversion(
- feats,
- feats_lengths,
- sids_src,
- sids_tgt,
- spembs_src,
- spembs_tgt,
- lids, )
-
- return dict(wav=paddle.reshape(wav, [-1]))
diff --git a/paddlespeech/t2s/models/vits/vits_updater.py b/paddlespeech/t2s/models/vits/vits_updater.py
index 9f8be6803..76271fd97 100644
--- a/paddlespeech/t2s/models/vits/vits_updater.py
+++ b/paddlespeech/t2s/models/vits/vits_updater.py
@@ -111,8 +111,6 @@ class VITSUpdater(StandardUpdater):
text_lengths=batch["text_lengths"],
feats=batch["feats"],
feats_lengths=batch["feats_lengths"],
- sids=batch.get("spk_id", None),
- spembs=batch.get("spk_emb", None),
forward_generator=turn == "generator")
# Generator
if turn == "generator":
@@ -270,8 +268,6 @@ class VITSEvaluator(StandardEvaluator):
text_lengths=batch["text_lengths"],
feats=batch["feats"],
feats_lengths=batch["feats_lengths"],
- sids=batch.get("spk_id", None),
- spembs=batch.get("spk_emb", None),
forward_generator=turn == "generator")
# Generator
if turn == "generator":
diff --git a/paddlespeech/t2s/training/updaters/standard_updater.py b/paddlespeech/t2s/training/updaters/standard_updater.py
index 6d3aa7099..668d2fc69 100644
--- a/paddlespeech/t2s/training/updaters/standard_updater.py
+++ b/paddlespeech/t2s/training/updaters/standard_updater.py
@@ -24,11 +24,10 @@ from paddle.nn import Layer
from paddle.optimizer import Optimizer
from timer import timer
-from paddlespeech.t2s.datasets.sampler import ErnieSATSampler
from paddlespeech.t2s.training.reporter import report
from paddlespeech.t2s.training.updater import UpdaterBase
from paddlespeech.t2s.training.updater import UpdaterState
-
+from paddlespeech.t2s.datasets.sampler import ErnieSATSampler
class StandardUpdater(UpdaterBase):
"""An example of over-simplification. Things may not be that simple, but