diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py
index 3694e106a..a5ab7b260 100644
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Contains data helper functions."""
 
+import numpy as np
+import math
 import json
 import codecs
 import os
@@ -50,3 +52,85 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
                 json_data["duration"] >= min_duration):
             manifest.append(json_data)
     return manifest
+
+
+def rms_to_db(rms: float):
+    """Root Mean Square to dB.
+    Args:
+        rms ([float]): root mean square
+
+    Returns:
+        float: dB
+    """
+    return 20.0 * math.log10(max(1e-16, rms))
+
+
+def rms_to_dbfs(rms: float):
+    """Root Mean Square to dBFS.
+    https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/
+    Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB.
+
+    dB = dBFS + 3.0103
+    dBFS = dB - 3.0103
+    e.g. 0 dB = -3.0103 dBFS
+
+    Args:
+        rms ([float]): root mean square
+
+    Returns:
+        float: dBFS
+    """
+    return rms_to_db(rms) - 3.0103
+
+
+def max_dbfs(sample_data: np.ndarray):
+    """Peak dBFS based on the maximum energy sample.
+
+    Args:
+        sample_data ([np.ndarray]): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    # Peak dBFS based on the maximum energy sample. Will prevent overdrive if used for normalization.
+    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
+
+
+def mean_dbfs(sample_data: np.ndarray):
+    """Mean dBFS based on the RMS energy.
+
+    Args:
+        sample_data ([np.ndarray]): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    return rms_to_dbfs(
+        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
+
+
+def gain_db_to_ratio(gain_db: float):
+    """dB to ratio
+
+    Args:
+        gain_db (float): gain in dB
+
+    Returns:
+        float: scale in amp
+    """
+    return math.pow(10.0, gain_db / 20.0)
+
+
+def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
+    """Normalize audio to dBFS.
+
+    Args:
+        sample_data (np.ndarray): input wave samples, [-1, 1].
+        dbfs (float, optional): target dBFS. Defaults to -3.0103.
+
+    Returns:
+        np.ndarray: normalized wave
+    """
+    return np.maximum(
+        np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)),
+                   1.0), -1.0)
diff --git a/deepspeech/modules/encoder_layer.py b/deepspeech/modules/encoder_layer.py
index bd117b976..f18b75999 100644
--- a/deepspeech/modules/encoder_layer.py
+++ b/deepspeech/modules/encoder_layer.py
@@ -133,7 +133,7 @@ class ConformerEncoderLayer(nn.Layer):
     def __init__(
             self,
             size: int,
-            self_attn: int,
+            self_attn: nn.Layer,
             feed_forward: Optional[nn.Layer]=None,
             feed_forward_macaron: Optional[nn.Layer]=None,
             conv_module: Optional[nn.Layer]=None,
diff --git a/deepspeech/utils/common.py b/deepspeech/utils/common.py
index 801b32e95..b4673e2be 100644
--- a/deepspeech/utils/common.py
+++ b/deepspeech/utils/common.py
@@ -110,4 +110,4 @@ def log_add(args: List[int]) -> float:
         return -float('inf')
     a_max = max(args)
     lsp = math.log(sum(math.exp(a - a_max) for a in args))
-    return a_max + lsp
+    return a_max + lsp
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index f0d92f7c6..585becbaf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ tensorboardX
 yacs
 typeguard
 pre-commit
+paddlepaddle-gpu==2.0.0