You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
119 lines
4.6 KiB
119 lines
4.6 KiB
4 years ago
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
||
8 years ago
|
"""Contains the speech featurizer class."""
|
||
8 years ago
|
|
||
4 years ago
|
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
|
||
|
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
|
||
8 years ago
|
|
||
|
|
||
|
class SpeechFeaturizer(object):
    """Speech featurizer, for extracting features from both audio and transcript
    contents of SpeechSegment.

    Currently, for audio parts, it supports feature types of linear
    spectrogram and mfcc; for transcript parts, it only supports char-level
    tokenizing and conversion into a list of token indices. Note that the
    token indexing order follows the given vocabulary file.

    This class is a thin facade: audio work is delegated to an
    ``AudioFeaturizer`` and transcript work to a ``TextFeaturizer``.

    :param vocab_filepath: Filepath to load vocabulary for token indices
                           conversion.
    :type vocab_filepath: str
    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param n_fft: Forwarded unchanged to ``AudioFeaturizer``. Presumably the
                  FFT size, with None letting the audio featurizer choose;
                  confirm against ``AudioFeaturizer``.
    :type n_fft: None|int
    :param max_freq: When specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned; when specgram_type is 'mfcc', max_freq is the
                     highest band edge of mel filters.
    :type max_freq: None|float
    :param target_sample_rate: Speech are resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibels before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
    """

    def __init__(self,
                 vocab_filepath,
                 specgram_type='linear',
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        # Every audio-related option is passed straight through; this class
        # adds no validation or defaults of its own beyond the signature.
        self._audio_featurizer = AudioFeaturizer(
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
            max_freq=max_freq,
            target_sample_rate=target_sample_rate,
            use_dB_normalization=use_dB_normalization,
            target_dB=target_dB)
        self._text_featurizer = TextFeaturizer(vocab_filepath)

    def featurize(self, speech_segment, keep_transcription_text):
        """Extract features for speech segment.

        1. For audio parts, extract the audio features.
        2. For transcript parts, keep the original text or convert text string
           to a list of token indices in char-level.

        :param speech_segment: Speech segment to extract features from.
        :type speech_segment: SpeechSegment
        :param keep_transcription_text: If true, return the raw transcript of
                                        the segment instead of token indices.
        :type keep_transcription_text: bool
        :return: A tuple of 1) the audio feature produced by the audio
                 featurizer, 2) the original transcript when
                 keep_transcription_text is true, otherwise the transcript
                 converted by the text featurizer.
        :rtype: tuple
        """
        # Audio features are always computed first, whichever transcript
        # form the caller asked for.
        audio_feature = self._audio_featurizer.featurize(speech_segment)
        if keep_transcription_text:
            # The text featurizer is not invoked at all on this path.
            return audio_feature, speech_segment.transcript
        text_ids = self._text_featurizer.featurize(speech_segment.transcript)
        return audio_feature, text_ids

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size, as reported by the text featurizer.
        :rtype: int
        """
        return self._text_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list, as reported by the text featurizer.
        :rtype: list
        """
        return self._text_featurizer.vocab_list

    @property
    def feature_size(self):
        """Return the audio feature size.

        :return: Audio feature size, as reported by the audio featurizer.
        :rtype: int
        """
        return self._audio_featurizer.feature_size
|