Merge branch 'develop' into docker

pull/4003/head
liyulingyue 6 months ago
commit eaebcf409a

@@ -233,7 +233,7 @@ def spectrogram(waveform: Tensor,
         round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
             to FFT. Defaults to True.
         sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+        snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
             is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
         subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
         window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey".
@@ -443,7 +443,7 @@ def fbank(waveform: Tensor,
         round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
             to FFT. Defaults to True.
         sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+        snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
             is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
         subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
         use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
@@ -566,7 +566,7 @@ def mfcc(waveform: Tensor,
         round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
             to FFT. Defaults to True.
         sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-        snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+        snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
            is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
         subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
         use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
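The snip_edges flag described in these docstrings controls how many frames a waveform yields. A rough sketch of Kaldi-style frame counting, as a hypothetical standalone helper (not code from this commit):

def num_frames(num_samples: int, frame_length: int, frame_shift: int,
               snip_edges: bool = True) -> int:
    # With snip_edges=True, trailing samples that cannot fill a whole
    # frame are dropped; otherwise reflect padding covers every sample.
    if snip_edges:
        if num_samples < frame_length:
            return 0
        return 1 + (num_samples - frame_length) // frame_shift
    return (num_samples + frame_shift // 2) // frame_shift

print(num_frames(16000, 400, 160))                    # 98 (edges snipped)
print(num_frames(16000, 400, 160, snip_edges=False))  # 100 (reflect padded)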

@@ -527,7 +527,7 @@ def melspectrogram(x: np.ndarray,
     if fmax is None:
         fmax = sr // 2
     if fmin < 0 or fmin >= fmax:
-        raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
+        raise ParameterError('fmin and fmax must satisfy 0<fmin<fmax')
     s = stft(
         x,
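The corrected message belongs to the guard right above it, which enforces 0 <= fmin < fmax for the mel filter range. A minimal standalone sketch of the same check (ParameterError stands in for the library's own exception):

class ParameterError(Exception):
    pass

def check_mel_range(sr: int, fmin: float = 0.0, fmax: float = None) -> float:
    if fmax is None:
        fmax = sr // 2  # default to the Nyquist frequency
    if fmin < 0 or fmin >= fmax:
        raise ParameterError('fmin and fmax must satisfy 0<fmin<fmax')
    return fmax

check_mel_range(16000)                          # returns 8000
check_mel_range(16000, fmin=50.0, fmax=7600.0)  # passes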

@@ -35,7 +35,7 @@ class ESC50(AudioClassificationDataset):
         http://dx.doi.org/10.1145/2733373.2806390
     """
-    archieves = [
+    archives = [
         {
             'url':
             'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
@@ -133,7 +133,7 @@ class ESC50(AudioClassificationDataset):
     def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
         if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                 not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
+            download_and_decompress(self.archives, DATA_HOME)
         meta_info = self._get_meta_info()
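Every dataset touched by this rename follows the same pattern: a class-level archives list of url/md5 entries handed to download_and_decompress when the data is missing. A simplified sketch of that pattern with a stubbed downloader (the real utility is repo-internal):

import os

DATA_HOME = os.path.expanduser("~/.paddleaudio/datasets")  # assumed cache dir

class ToyDataset:
    # Same shape as the renamed attribute: url/md5 archive entries.
    archives = [
        {
            'url': 'https://example.com/toy-dataset.zip',  # placeholder
            'md5': '0' * 32,
        },
    ]
    audio_path = 'toy'

    def _get_data(self):
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
            # The real code calls download_and_decompress(self.archives, DATA_HOME).
            for entry in self.archives:
                print(f"would fetch {entry['url']} (md5 {entry['md5']})")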

@@ -35,7 +35,7 @@ class GTZAN(AudioClassificationDataset):
         https://ieeexplore.ieee.org/document/1021072/
     """
-    archieves = [
+    archives = [
         {
             'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
             'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
@@ -85,7 +85,7 @@ class GTZAN(AudioClassificationDataset):
                   split) -> Tuple[List[str], List[int]]:
         if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                 not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
+            download_and_decompress(self.archives, DATA_HOME)
         meta_info = self._get_meta_info()
         random.seed(seed)  # shuffle samples to split data

@@ -30,7 +30,7 @@ __all__ = ['OpenRIRNoise']
 class OpenRIRNoise(Dataset):
-    archieves = [
+    archives = [
         {
             'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
             'md5': 'e6f48e257286e05de56413b4779d8ffb',
@@ -76,7 +76,7 @@ class OpenRIRNoise(Dataset):
         print(f"rirs noises base path: {self.base_path}")
         if not os.path.isdir(self.base_path):
             download_and_decompress(
-                self.archieves, self.base_path, decompress=True)
+                self.archives, self.base_path, decompress=True)
         else:
             print(
                 f"{self.base_path} already exists, we will not download and decompress again"

@@ -37,7 +37,7 @@ class TESS(AudioClassificationDataset):
         https://doi.org/10.5683/SP2/E8H2MF
     """
-    archieves = [
+    archives = [
         {
             'url':
             'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
@@ -93,7 +93,7 @@ class TESS(AudioClassificationDataset):
     def _get_data(self, mode, seed, n_folds,
                   split) -> Tuple[List[str], List[int]]:
         if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
-            download_and_decompress(self.archieves, DATA_HOME)
+            download_and_decompress(self.archives, DATA_HOME)
         wav_files = []
         for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):

@@ -35,7 +35,7 @@ class UrbanSound8K(AudioClassificationDataset):
         https://dl.acm.org/doi/10.1145/2647868.2655045
     """
-    archieves = [
+    archives = [
         {
             'url':
             'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
@@ -81,7 +81,7 @@ class UrbanSound8K(AudioClassificationDataset):
     def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
         if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                 not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-            download_and_decompress(self.archieves, DATA_HOME)
+            download_and_decompress(self.archives, DATA_HOME)
         meta_info = self._get_meta_info()

@@ -34,7 +34,7 @@ __all__ = ['VoxCeleb']
 class VoxCeleb(Dataset):
     source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
-    archieves_audio_dev = [
+    archives_audio_dev = [
         {
             'url': source_url + 'vox1_dev_wav_partaa',
             'md5': 'e395d020928bc15670b570a21695ed96',
@@ -52,13 +52,13 @@ class VoxCeleb(Dataset):
             'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
         },
     ]
-    archieves_audio_test = [
+    archives_audio_test = [
         {
             'url': source_url + 'vox1_test_wav.zip',
             'md5': '185fdc63c3c739954633d50379a3d102',
         },
     ]
-    archieves_meta = [
+    archives_meta = [
         {
             'url':
             'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
@@ -135,11 +135,11 @@ class VoxCeleb(Dataset):
         if not os.path.isdir(self.wav_path):
             print("start to download the voxceleb1 dataset")
             download_and_decompress(  # multi-zip parts concatenate to vox1_dev_wav.zip
-                self.archieves_audio_dev,
+                self.archives_audio_dev,
                 self.base_path,
                 decompress=False)
             download_and_decompress(  # download the vox1_test_wav.zip and unzip
-                self.archieves_audio_test,
+                self.archives_audio_test,
                 self.base_path,
                 decompress=True)
@@ -157,7 +157,7 @@ class VoxCeleb(Dataset):
         if not os.path.isdir(self.meta_path):
             print("prepare the meta data")
             download_and_decompress(
-                self.archieves_meta, self.meta_path, decompress=False)
+                self.archives_meta, self.meta_path, decompress=False)
         # Data preparation.
         if not os.path.isdir(self.csv_path):
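As the inline comment says, the dev-set parts are fetched with decompress=False and must be concatenated into vox1_dev_wav.zip before extraction. A hedged sketch of that concatenation step (file names are illustrative; the real class derives paths from base_path):

import shutil

parts = ["vox1_dev_wav_partaa", "vox1_dev_wav_partab"]  # and so on
with open("vox1_dev_wav.zip", "wb") as dst:
    for part in parts:
        with open(part, "rb") as src:
            shutil.copyfileobj(src, dst)  # byte-concatenate the parts in order
# then unzip vox1_dev_wav.zip and verify the checksum of the joined archive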

@@ -109,7 +109,7 @@ def create_manifest(data_dir, manifest_path):
 def prepare_chime3(url, md5sum, target_dir, manifest_path):
-    """Download, unpack and create summmary manifest file."""
+    """Download, unpack and create summary manifest file."""
     if not os.path.exists(os.path.join(target_dir, "CHiME3")):
         # download
         filepath = download(url, md5sum, target_dir,

@@ -210,7 +210,7 @@ def create_manifest(data_dir, manifest_path_prefix):
 def prepare_dataset(url, md5sum, target_dir, manifest_path):
-    """Download, unpack and create summmary manifest file.
+    """Download, unpack and create summary manifest file.
     """
     filepath = os.path.join(target_dir, "TIMIT.zip")
     if not os.path.exists(filepath):
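Both prepare_* functions implement the download-unpack-manifest flow named by the corrected docstrings. A rough sketch of the manifest-writing step; the JSON-lines layout and field name are assumptions based on common practice, not this repo's exact schema:

import json
import os

def create_manifest_sketch(data_dir: str, manifest_path: str) -> None:
    # One JSON object per line, one line per audio file.
    with open(manifest_path, "w", encoding="utf8") as fout:
        for root, _, files in os.walk(data_dir):
            for name in files:
                if name.endswith(".wav"):
                    entry = {"audio_filepath": os.path.join(root, name)}
                    fout.write(json.dumps(entry) + "\n")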

@@ -115,27 +115,27 @@ int FrontEngineInterface::init() {
     // Generate the dictionary (word-to-phoneme mapping)
     if (0 != GenDict(_word2phone_path, &word_phone_map)) {
-        LOG(ERROR) << "Genarate word2phone dict failed";
+        LOG(ERROR) << "Generate word2phone dict failed";
         return -1;
     }
     // Generate the phoneme dictionary (phoneme to phoneme-id mapping)
     if (0 != GenDict(_phone2id_path, &phone_id_map)) {
-        LOG(ERROR) << "Genarate phone2id dict failed";
+        LOG(ERROR) << "Generate phone2id dict failed";
         return -1;
     }
     // Generate the tone dictionary (tone to tone-id mapping)
     if (_separate_tone == "true") {
         if (0 != GenDict(_tone2id_path, &tone_id_map)) {
-            LOG(ERROR) << "Genarate tone2id dict failed";
+            LOG(ERROR) << "Generate tone2id dict failed";
             return -1;
         }
     }
     // Generate the traditional-to-simplified dictionary (traditional to simplified character mapping)
     if (0 != GenDict(_trand2simp_path, &trand_simp_map)) {
-        LOG(ERROR) << "Genarate trand2simp dict failed";
+        LOG(ERROR) << "Generate trand2simp dict failed";
         return -1;
     }
@@ -263,7 +263,7 @@ int FrontEngineInterface::GetWordsIds(
                 if (0 !=
                     GetInitialsFinals(word, &word_initials, &word_finals)) {
                     LOG(ERROR)
-                        << "Genarate the word_initials and word_finals of "
+                        << "Generate the word_initials and word_finals of "
                         << word << " failed";
                     return -1;
                 }
@@ -304,7 +304,7 @@ int FrontEngineInterface::GetWordsIds(
         // Phoneme to phoneme id
         if (0 != Phone2Phoneid(phone, phoneids, toneids)) {
-            LOG(ERROR) << "Genarate the phone id of " << word << " failed";
+            LOG(ERROR) << "Generate the phone id of " << word << " failed";
             return -1;
         }
     }
@@ -916,11 +916,11 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word,
         if (find(must_neural_tone_words.begin(),
                  must_neural_tone_words.end(),
                  word) != must_neural_tone_words.end() ||
-            (word_num >= 2 &&
-             find(must_neural_tone_words.begin(),
-                  must_neural_tone_words.end(),
-                  ppspeech::wstring2utf8string(word_wstr.substr(
-                      word_num - 2))) != must_neural_tone_words.end())) {
+            (word_num >= 2 && find(must_neural_tone_words.begin(),
+                                   must_neural_tone_words.end(),
+                                   ppspeech::wstring2utf8string(
+                                       word_wstr.substr(word_num - 2))) !=
+                 must_neural_tone_words.end())) {
             (*finals).back() =
                 (*finals).back().replace((*finals).back().length() - 1, 1, "5");
         }

@@ -77,13 +77,13 @@ class MilvusHelper:
         field1 = FieldSchema(
             name="id",
             dtype=DataType.INT64,
-            descrition="int64",
+            description="int64",
             is_primary=True,
             auto_id=True)
         field2 = FieldSchema(
             name="embedding",
             dtype=DataType.FLOAT_VECTOR,
-            descrition="speaker embeddings",
+            description="speaker embeddings",
             dim=VECTOR_DIMENSION,
             is_primary=False)
         schema = CollectionSchema(
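With the misspelled keyword, the intended description text likely never reached the schema, so this fix is more than cosmetic. A self-contained sketch of the corrected construction (the dimension and collection description are placeholders):

from pymilvus import CollectionSchema, DataType, FieldSchema

VECTOR_DIMENSION = 192  # placeholder; the real value comes from config

fields = [
    FieldSchema(name="id", dtype=DataType.INT64,
                description="int64", is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR,
                description="speaker embeddings", dim=VECTOR_DIMENSION),
]
schema = CollectionSchema(fields=fields, description="speaker embeddings table")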

@@ -225,7 +225,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
         websocket (WebSocket): the websocket instance
     """
-    #1. the interface wait to accept the websocket protocal header
+    #1. the interface wait to accept the websocket protocol header
     # and only we receive the header, it establish the connection with specific thread
     await websocket.accept()
@@ -238,7 +238,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
     connection_handler = None
     try:
-        #4. we do a loop to process the audio package by package according the protocal
+        #4. we do a loop to process the audio package by package according the protocol
         # and only if the client send finished signal, we will break the loop
         while True:
             # careful here, changed the source code from starlette.websockets
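A minimal sketch of the accept-then-loop shape these corrected comments describe, written with FastAPI; the route and the "finished" signal are illustrative, not the server's actual protocol:

from fastapi import FastAPI, WebSocket

app = FastAPI()

@app.websocket("/ws/asr")
async def websocket_endpoint_sketch(websocket: WebSocket):
    # 1. accept the websocket protocol header to establish the connection
    await websocket.accept()
    try:
        # 4. process the audio package by package until the client finishes
        while True:
            message = await websocket.receive()
            if "bytes" in message:
                pass  # hand message["bytes"] to the recognizer here
            elif message.get("text") == "finished":  # illustrative end signal
                break
    finally:
        await websocket.close()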

@@ -75,7 +75,7 @@ class TritonPythonModel:
     def initialize(self, args):
         """`initialize` is called only once when the model is being loaded.
         Implementing `initialize` function is optional. This function allows
-        the model to intialize any state associated with this model.
+        the model to initialize any state associated with this model.
         Parameters
         ----------
         args : dict
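For reference, the minimal shape of a Triton Python-backend model that the corrected docstring belongs to; a bare sketch where parsing model_config is the only state set up in initialize:

import json

class TritonPythonModel:
    def initialize(self, args):
        # Called once at load time; may set up any state associated
        # with this model. args["model_config"] is a JSON string.
        self.model_config = json.loads(args["model_config"])

    def execute(self, requests):
        # Real models build one pb_utils.InferenceResponse per request.
        return []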

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     encoder_normalize_before: True    # whether to perform layer normalization before the input
@@ -66,14 +66,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: true    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy
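These comment fixes are documentation-only; the keys and values feed the model constructor unchanged. A small sketch of loading such a config and reading the keys whose comments were fixed (the file name is illustrative):

import yaml

with open("conf/default.yaml", encoding="utf8") as f:
    config = yaml.safe_load(f)

model_cfg = config["model"]
print(model_cfg["postnet_layers"])                # 5
print(model_cfg["pitch_predictor_kernel_size"])   # 5
print(model_cfg["energy_predictor_kernel_size"])  # 3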

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -39,7 +39,7 @@ generator_params:
     use_additional_convs: True    # Whether to use additional conv layer in residual blocks.
     bias: True    # Whether to use bias parameter in conv.
    nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True    # Whether to apply weight normalization.
@@ -77,7 +77,7 @@ discriminator_params:
     max_downsample_channels: 1024    # Maximum number of channels in downsampling conv layers.
     bias: True    # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu"    # Nonlinear activation.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
     negative_slope: 0.1
     use_weight_norm: True    # Whether to apply weight normalization.
     use_spectral_norm: False    # Whether to apply spectral normalization.
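nonlinear_activation and nonlinear_activation_params are typically resolved by name and splatted into the layer constructor. A hedged Paddle sketch of that pattern (the lookup table is an assumption, not this repo's exact resolution code):

import paddle.nn as nn

nonlinear_activation = "leakyrelu"                     # from the YAML above
nonlinear_activation_params = {"negative_slope": 0.1}

# Map the lowercase YAML name onto a Paddle layer class, then splat params.
activations = {"leakyrelu": nn.LeakyReLU}
act = activations[nonlinear_activation](**nonlinear_activation_params)
print(act)  # LeakyReLU(negative_slope=0.1)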

@@ -45,7 +45,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -63,14 +63,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: true    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -43,7 +43,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -65,14 +65,14 @@ model:
     cnn_decoder_embedding_dim: 256
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     encoder_normalize_before: True    # whether to perform layer normalization before the input
@@ -66,14 +66,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2    # number of layers of duration predictor
     duration_predictor_chans: 256    # number of channels of duration predictor
     duration_predictor_kernel_size: 3    # filter size of duration predictor
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2    # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5    # number of conv layers in pitch predictor
     pitch_predictor_chans: 256    # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5    # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5    # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5    # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1    # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0    # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True    # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2    # number of conv layers in energy predictor
     energy_predictor_chans: 256    # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3    # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3    # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5    # dropout rate in energy predictor
     energy_embed_kernel_size: 1    # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0    # dropout rate after conv embedding layer for energy

@@ -38,7 +38,7 @@ generator_params:
     use_additional_convs: True    # Whether to use additional conv layer in residual blocks.
     bias: True    # Whether to use bias parameter in conv.
     nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True    # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
     max_downsample_channels: 1024    # Maximum number of channels in downsampling conv layers.
     bias: True    # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu"    # Nonlinear activation.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
     negative_slope: 0.1
     use_weight_norm: True    # Whether to apply weight normalization.
     use_spectral_norm: False    # Whether to apply spectral normalization.

@@ -38,7 +38,7 @@ generator_params:
     use_additional_convs: True    # Whether to use additional conv layer in residual blocks.
     bias: True    # Whether to use bias parameter in conv.
     nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True    # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
     max_downsample_channels: 1024    # Maximum number of channels in downsampling conv layers.
     bias: True    # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu"    # Nonlinear activation.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
     negative_slope: 0.1
     use_weight_norm: True    # Whether to apply weight normalization.
     use_spectral_norm: False    # Whether to apply spectral normalization.

@@ -42,7 +42,7 @@ generator_params:
     use_additional_convs: True    # Whether to use additional conv layer in residual blocks.
     bias: True    # Whether to use bias parameter in conv.
     nonlinear_activation: "leakyrelu"    # Nonlinear activation type.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
        negative_slope: 0.1
    use_weight_norm: True    # Whether to apply weight normalization.
@@ -83,7 +83,7 @@ discriminator_params:
     max_downsample_channels: 1024    # Maximum number of channels in downsampling conv layers.
     bias: True    # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu"    # Nonlinear activation.
-    nonlinear_activation_params:    # Nonlinear activation paramters.
+    nonlinear_activation_params:    # Nonlinear activation parameters.
     negative_slope: 0.1
     use_weight_norm: True    # Whether to apply weight normalization.
     use_spectral_norm: False    # Whether to apply spectral normalization.

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
 # "sbatch" (Slurm)
 elif [ "${cmd_backend}" = slurm ]; then
     # The default setting is written in conf/slurm.conf.
-    # You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
-    # To know the "partion" names, type "sinfo".
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
     # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
     # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -34,7 +34,7 @@ model: # keyword arguments for the selected model
     dunits: 1024    # number of decoder ff units
     positionwise_layer_type: conv1d    # type of position-wise layer
     positionwise_conv_kernel_size: 1    # kernel size of position wise conv layer
-    postnet_layers: 5    # number of layers of postnset
+    postnet_layers: 5    # number of layers of postnet
     postnet_filts: 5    # filter size of conv layers in postnet
     postnet_chans: 256    # number of channels of conv layers in postnet
     use_scaled_pos_enc: True    # whether to use scaled positional encoding

@@ -42,7 +42,7 @@ model:
     duration_predictor_layers: 2 # number of layers of duration predictor
     duration_predictor_chans: 256 # number of channels of duration predictor
     duration_predictor_kernel_size: 3 # filter size of duration predictor
-    postnet_layers: 5 # number of layers of postnset
+    postnet_layers: 5 # number of layers of postnet
     postnet_filts: 5 # filter size of conv layers in postnet
     postnet_chans: 256 # number of channels of conv layers in postnet
     use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
     transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
     pitch_predictor_layers: 5 # number of conv layers in pitch predictor
     pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2 # number of conv layers in energy predictor
     energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5 # dropout rate in energy predictor
     energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
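The pitch and energy predictor settings in this hunk all parameterize the same module shape: a stack of Conv1D blocks followed by a per-frame projection. The sketch below only illustrates how `*_predictor_layers`, `*_predictor_chans`, `*_predictor_kernel_size`, and `*_predictor_dropout` fit together; the class name, input dimension, and final linear layer are assumptions, not the FastSpeech2 source.

```python
import paddle.nn as nn

class VariancePredictorSketch(nn.Layer):
    """Illustrative variance (pitch/energy) predictor, not the real one."""
    def __init__(self, idim=384, layers=5, chans=256, kernel_size=5, dropout=0.5):
        super().__init__()
        blocks = []
        for i in range(layers):
            in_chans = idim if i == 0 else chans
            blocks += [
                nn.Conv1D(in_chans, chans, kernel_size,
                          padding=(kernel_size - 1) // 2),  # keep T unchanged
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
        self.conv = nn.Sequential(*blocks)
        self.linear = nn.Linear(chans, 1)  # one scalar (pitch/energy) per frame

    def forward(self, xs):  # xs: (B, T, idim)
        hs = self.conv(xs.transpose([0, 2, 1]))      # (B, chans, T)
        return self.linear(hs.transpose([0, 2, 1]))  # (B, T, 1)
```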

@@ -38,7 +38,7 @@ generator_params:
     use_additional_convs: True # Whether to use additional conv layer in residual blocks.
     bias: True # Whether to use bias parameter in conv.
     nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-    nonlinear_activation_params: # Nonlinear activation paramters.
+    nonlinear_activation_params: # Nonlinear activation parameters.
         negative_slope: 0.1
     use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
     max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
     bias: True # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu" # Nonlinear activation.
-    nonlinear_activation_params: # Nonlinear activation paramters.
+    nonlinear_activation_params: # Nonlinear activation parameters.
         negative_slope: 0.1
     use_weight_norm: True # Whether to apply weight normalization.
     use_spectral_norm: False # Whether to apply spectral normalization.
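Both hunks fix the same comment on `nonlinear_activation_params`. For context, a name-plus-params pair like this is typically resolved into a layer by keyword expansion; a minimal sketch, where the lookup table and helper are assumptions rather than the actual vocoder code:

```python
import paddle.nn as nn

_ACTIVATIONS = {"leakyrelu": nn.LeakyReLU, "relu": nn.ReLU}

def build_activation(name: str, params: dict) -> nn.Layer:
    # "leakyrelu" + {"negative_slope": 0.1} -> nn.LeakyReLU(negative_slope=0.1)
    return _ACTIVATIONS[name.lower()](**params)

act = build_activation("leakyrelu", {"negative_slope": 0.1})
```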

@@ -68,14 +68,14 @@ model:
     duration_predictor_dropout_rate: 0.5 # dropout rate in energy predictor
     pitch_predictor_layers: 5 # number of conv layers in pitch predictor
     pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-    pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+    pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
     pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
     pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
     pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
     stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
     energy_predictor_layers: 2 # number of conv layers in energy predictor
     energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-    energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+    energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
     energy_predictor_dropout: 0.5 # dropout rate in energy predictor
     energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
     energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -97,7 +97,7 @@ def test_full_scores_words():
        if w not in model:
            print('"{0}" is an OOV'.format(w))
            oov.append(w)
-    # zh_giga.no_cna_cmn.prune01244.klm is chinese charactor LM
+    # zh_giga.no_cna_cmn.prune01244.klm is chinese character LM
    assert oov == ["盘点", "不怕", "网站", "", "", "海淘", "向来", "便宜", "保真",
                   ""], 'error oov'

@@ -39,7 +39,7 @@ generator_params:
     use_additional_convs: True # Whether to use additional conv layer in residual blocks.
     bias: True # Whether to use bias parameter in conv.
     nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-    nonlinear_activation_params: # Nonlinear activation paramters.
+    nonlinear_activation_params: # Nonlinear activation parameters.
         negative_slope: 0.1
     use_weight_norm: True # Whether to apply weight normalization.
@@ -77,7 +77,7 @@ discriminator_params:
     max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
     bias: True # Whether to use bias parameter in conv layer."
     nonlinear_activation: "leakyrelu" # Nonlinear activation.
-    nonlinear_activation_params: # Nonlinear activation paramters.
+    nonlinear_activation_params: # Nonlinear activation parameters.
         negative_slope: 0.1
     use_weight_norm: True # Whether to apply weight normalization.
     use_spectral_norm: False # Whether to apply spectral normalization.

@@ -32,8 +32,8 @@ def main(args, config):
     seed_everything(config.seed)

     # stage 1: generate the voxceleb csv file
-    # Note: this may occurs c++ execption, but the program will execute fine
-    # so we ignore the execption
+    # Note: this may occurs c++ exception, but the program will execute fine
+    # so we ignore the exception
     # we explicitly pass the vox2 base path to data prepare and generate the audio info
     logger.info("start to generate the voxceleb dataset info")
     train_dataset = VoxCeleb(

@@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
     max_size = paddle.shape(sequences[0])
-    # (TODO Hui Zhang): slice not supprot `end==start`
+    # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(
         max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -93,7 +93,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
         length = tensor.shape[0]
         # use index notation to prevent duplicate references to the tensor
         if batch_first:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # TODO (Hui Zhang): set_value op not support int16
             # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
@@ -102,7 +102,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
             else:
                 out_tensor[i, length] = tensor
         else:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # out_tensor[:length, i, ...] = tensor
             if length != 0:
                 out_tensor[:length, i] = tensor
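The `if length != 0` guards exist because, per the TODO comments above, paddle's slice assignment (the set_value op) does not support empty `end == start` slices. A condensed, batch-first-only sketch of the same padding loop; the function name and the use of `paddle.full` are mine, not the source:

```python
import paddle

def pad_batch_first(sequences, padding_value=0.0):
    max_len = max(seq.shape[0] for seq in sequences)
    trailing = tuple(sequences[0].shape[1:])
    out = paddle.full((len(sequences), max_len) + trailing,
                      padding_value, dtype=sequences[0].dtype)
    for i, seq in enumerate(sequences):
        if seq.shape[0] != 0:  # out[i, :0] would hit the end==start case
            out[i, :seq.shape[0]] = seq
    return out
```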

@@ -62,7 +62,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             if line == '':
                 continue
             audio_id, text = line.split(' ', 1)
-            # remove withespace, charactor text
+            # remove withespace, character text
             text = ''.join(text.split())
             transcript_dict[audio_id] = text
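For reference, the parsing idiom this hunk documents, standalone: each transcript line is `<audio_id> <text>`, and all internal whitespace is stripped from the text because the corpus is handled character by character (the file name is illustrative):

```python
transcript_dict = {}
with open('transcript.txt', encoding='utf8') as f:
    for line in f:
        line = line.strip()
        if line == '':
            continue
        audio_id, text = line.split(' ', 1)
        transcript_dict[audio_id] = ''.join(text.split())
```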

@@ -65,7 +65,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             if line == '':
                 continue
             audio_id, text = line.split(' ', 1)
-            # remove withespace, charactor text
+            # remove withespace, character text
             text = ''.join(text.split())
             transcript_dict[audio_id] = text
@@ -159,7 +159,7 @@ def check_dataset(data_dir):
             if line == '':
                 continue
             audio_id, text = line.split(' ', 1)
-            # remove withespace, charactor text
+            # remove withespace, character text
             text = ''.join(text.split())
             transcript_dict[audio_id] = text

@@ -362,7 +362,7 @@ class HubertASRTrainer(Trainer):
         scratch = None
         if self.args.resume:
             # just restore ckpt
-            # lr will resotre from optimizer ckpt
+            # lr will restore from optimizer ckpt
             resume_json_path = os.path.join(self.checkpoint_dir,
                                             self.args.resume + '.json')
             with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -370,20 +370,20 @@ class HubertASRTrainer(Trainer):
             self.iteration = 0
             self.epoch = resume_json["epoch"]

-            # resotre model from *.pdparams
+            # restore model from *.pdparams
             params_path = os.path.join(self.checkpoint_dir,
                                        "{}".format(self.epoch)) + '.pdparams'
             model_dict = paddle.load(params_path)
             self.model.set_state_dict(model_dict)

-            # resotre optimizer from *.pdopt
+            # restore optimizer from *.pdopt
             optimizer_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdopt'
             optimizer_dict = paddle.load(optimizer_path)
             self.model_optimizer.set_state_dict(optimizer_dict['model'])
             self.hubert_optimizer.set_state_dict(optimizer_dict['hubert'])

-            # resotre lr_scheduler from *.pdlrs
+            # restore lr_scheduler from *.pdlrs
             scheduler_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdlrs'
             if os.path.isfile(os.path.join(scheduler_path)):

@@ -361,7 +361,7 @@ class Wav2Vec2ASRTrainer(Trainer):
         scratch = None
         if self.args.resume:
             # just restore ckpt
-            # lr will resotre from optimizer ckpt
+            # lr will restore from optimizer ckpt
             resume_json_path = os.path.join(self.checkpoint_dir,
                                             self.args.resume + '.json')
             with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -369,20 +369,20 @@ class Wav2Vec2ASRTrainer(Trainer):
             self.iteration = 0
             self.epoch = resume_json["epoch"]

-            # resotre model from *.pdparams
+            # restore model from *.pdparams
             params_path = os.path.join(self.checkpoint_dir,
                                        "{}".format(self.epoch)) + '.pdparams'
             model_dict = paddle.load(params_path)
             self.model.set_state_dict(model_dict)

-            # resotre optimizer from *.pdopt
+            # restore optimizer from *.pdopt
             optimizer_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdopt'
             optimizer_dict = paddle.load(optimizer_path)
             self.model_optimizer.set_state_dict(optimizer_dict['model'])
             self.wav2vec2_optimizer.set_state_dict(optimizer_dict['wav2vec2'])

-            # resotre lr_scheduler from *.pdlrs
+            # restore lr_scheduler from *.pdlrs
             scheduler_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdlrs'
             if os.path.isfile(os.path.join(scheduler_path)):

@@ -361,7 +361,7 @@ class WavLMASRTrainer(Trainer):
         scratch = None
         if self.args.resume:
             # just restore ckpt
-            # lr will resotre from optimizer ckpt
+            # lr will restore from optimizer ckpt
             resume_json_path = os.path.join(self.checkpoint_dir,
                                             self.args.resume + '.json')
             with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -369,20 +369,20 @@ class WavLMASRTrainer(Trainer):
             self.iteration = 0
             self.epoch = resume_json["epoch"]

-            # resotre model from *.pdparams
+            # restore model from *.pdparams
             params_path = os.path.join(self.checkpoint_dir,
                                        "{}".format(self.epoch)) + '.pdparams'
             model_dict = paddle.load(params_path)
             self.model.set_state_dict(model_dict)

-            # resotre optimizer from *.pdopt
+            # restore optimizer from *.pdopt
             optimizer_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdopt'
             optimizer_dict = paddle.load(optimizer_path)
             self.model_optimizer.set_state_dict(optimizer_dict['model'])
             self.wavlm_optimizer.set_state_dict(optimizer_dict['wavlm'])

-            # resotre lr_scheduler from *.pdlrs
+            # restore lr_scheduler from *.pdlrs
             scheduler_path = os.path.join(self.checkpoint_dir,
                                           "{}".format(self.epoch)) + '.pdlrs'
             if os.path.isfile(os.path.join(scheduler_path)):
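The three trainer hunks above fix the same `resotre` typo in identical resume logic: model weights come from `<epoch>.pdparams` and optimizer state from `<epoch>.pdopt`. Condensed into a standalone sketch; the signature is mine and error handling is omitted:

```python
import os
import paddle

def restore_checkpoint(checkpoint_dir, epoch, model, model_optimizer):
    # restore model from *.pdparams
    params_path = os.path.join(checkpoint_dir, '{}.pdparams'.format(epoch))
    model.set_state_dict(paddle.load(params_path))
    # restore optimizer from *.pdopt
    optimizer_path = os.path.join(checkpoint_dir, '{}.pdopt'.format(epoch))
    model_optimizer.set_state_dict(paddle.load(optimizer_path)['model'])
```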

@@ -215,7 +215,7 @@ class Trainer():
                 checkpoint_path=self.args.checkpoint_path)
             if infos:
                 # just restore ckpt
-                # lr will resotre from optimizer ckpt
+                # lr will restore from optimizer ckpt
                 self.iteration = infos["step"]
                 self.epoch = infos["epoch"]

@@ -171,7 +171,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
 def cer(reference, hypothesis, ignore_case=False, remove_space=False):
-    """Calculate charactor error rate (CER). CER compares reference text and
+    """Calculate character error rate (CER). CER compares reference text and
     hypothesis text in char-level. CER is defined as:
     .. math::

@@ -80,7 +80,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
     # assuming trailing dimensions and type of all the Tensors
     # in sequences are same and fetching those from sequences[0]
     max_size = paddle.shape(sequences[0])
-    # (TODO Hui Zhang): slice not supprot `end==start`
+    # (TODO Hui Zhang): slice not support `end==start`
     # trailing_dims = max_size[1:]
     trailing_dims = tuple(
         max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -98,7 +98,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
                 f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
             )
         if batch_first:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # TODO (Hui Zhang): set_value op not support int16
             # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
             # out_tensor[i, :length, ...] = tensor
@@ -107,7 +107,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
             else:
                 out_tensor[i, length] = tensor
         else:
-            # TODO (Hui Zhang): set_value op not supprot `end==start`
+            # TODO (Hui Zhang): set_value op not support `end==start`
             # out_tensor[:length, i, ...] = tensor
             if length != 0:
                 out_tensor[:length, i] = tensor

@@ -79,7 +79,7 @@ class ASRWsAudioHandler:
                  punc_server_ip=None,
                  punc_server_port=None):
         """PaddleSpeech Online ASR Server Client audio handler
-           Online asr server use the websocket protocal
+           Online asr server use the websocket protocol
         Args:
             url (str, optional): the server ip. Defaults to None.
             port (int, optional): the server port. Defaults to None.
@@ -144,10 +144,10 @@ class ASRWsAudioHandler:
             logger.error("No asr server, please input valid ip and port")
             return ""

-        # 1. send websocket handshake protocal
+        # 1. send websocket handshake protocol
         start_time = time.time()
         async with websockets.connect(self.url) as ws:
-            # 2. server has already received handshake protocal
+            # 2. server has already received handshake protocol
             # client start to send the command
             audio_info = json.dumps(
                 {
@@ -255,7 +255,7 @@ class ASRHttpHandler:
 class TTSWsHandler:
     def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
         """PaddleSpeech Online TTS Server Client audio handler
-           Online tts server use the websocket protocal
+           Online tts server use the websocket protocol
         Args:
             server (str, optional): the server ip. Defaults to "127.0.0.1".
             port (int, optional): the server port. Defaults to 8092.
@@ -405,7 +405,7 @@ class TTSWsHandler:
 class TTSHttpHandler:
     def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
         """PaddleSpeech Online TTS Server Client audio handler
-           Online tts server use the websocket protocal
+           Online tts server use the websocket protocol
         Args:
             server (str, optional): the server ip. Defaults to "127.0.0.1".
             port (int, optional): the server port. Defaults to 8092.
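The numbered comments fixed above describe the client side of the streaming protocol: open the websocket, then send a JSON command. A bare-bones sketch with the `websockets` package; the URL and JSON fields are assumptions based on the hunk, not the full protocol:

```python
import asyncio
import json
import websockets

async def run(url="ws://127.0.0.1:8090/paddlespeech/asr/streaming"):  # URL is illustrative
    async with websockets.connect(url) as ws:  # 1. websocket handshake
        # 2. after the handshake, the client starts to send the command
        await ws.send(json.dumps({"name": "test.wav", "signal": "start"}))
        print(await ws.recv())

asyncio.run(run())
```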

@@ -31,7 +31,7 @@ async def websocket_endpoint(websocket: WebSocket):
         websocket (WebSocket): the websocket instance
     """

-    #1. the interface wait to accept the websocket protocal header
+    #1. the interface wait to accept the websocket protocol header
     # and only we receive the header, it establish the connection with specific thread
     await websocket.accept()
@@ -45,7 +45,7 @@ async def websocket_endpoint(websocket: WebSocket):
     connection_handler = None

     try:
-        #4. we do a loop to process the audio package by package according the protocal
+        #4. we do a loop to process the audio package by package according the protocol
         # and only if the client send finished signal, we will break the loop
         while True:
             # careful here, changed the source code from starlette.websockets

@@ -32,7 +32,7 @@ async def websocket_endpoint(websocket: WebSocket):
         websocket (WebSocket): the websocket instance
     """

-    #1. the interface wait to accept the websocket protocal header
+    #1. the interface wait to accept the websocket protocol header
     # and only we receive the header, it establish the connection with specific thread
     await websocket.accept()

@@ -523,7 +523,7 @@ class Frontend():
             initials = []
             finals = []
-            # to charactor list
+            # to character list
             words = self._split_word_to_char(words[0])
             for pinyin, char in zip(pinyin_spec, words):
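In isolation, the last two context lines of this hunk split a word into single characters and pair each with its user-supplied pinyin; `_split_word_to_char` is approximated with `list()` here:

```python
word = "中国"
pinyin_spec = ["zhong1", "guo2"]
chars = list(word)  # to character list
for pinyin, char in zip(pinyin_spec, chars):
    print(char, pinyin)  # 中 zhong1 / 国 guo2
```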

@@ -76,7 +76,7 @@ class ResidualAffineCouplingBlock(nn.Layer):
             use_weight_norm (bool):
                 Whether to use weight normalization in WaveNet.
             bias (bool):
-                Whether to use bias paramters in WaveNet.
+                Whether to use bias parameters in WaveNet.
             use_only_mean (bool):
                 Whether to estimate only mean.
@@ -169,7 +169,7 @@ class ResidualAffineCouplingLayer(nn.Layer):
             use_weight_norm (bool):
                 Whether to use weight normalization in WaveNet.
             bias (bool):
-                Whether to use bias paramters in WaveNet.
+                Whether to use bias parameters in WaveNet.
             use_only_mean (bool):
                 Whether to estimate only mean.

@@ -159,7 +159,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
 def cer(reference, hypothesis, ignore_case=False, remove_space=False):
-    """Calculate charactor error rate (CER). CER compares reference text and
+    """Calculate character error rate (CER). CER compares reference text and
     hypothesis text in char-level. CER is defined as:
     .. math::
         CER = (Sc + Dc + Ic) / Nc
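The docstring's formula, CER = (Sc + Dc + Ic) / Nc, is the character-level edit distance (substitutions, deletions, insertions) normalized by reference length. A self-contained illustration, not the PaddleSpeech implementation:

```python
def edit_distance(ref, hyp):
    # classic dynamic-programming Levenshtein distance over characters
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i]
        for j, h in enumerate(hyp, 1):
            cur.append(min(prev[j] + 1,               # deletion
                           cur[j - 1] + 1,            # insertion
                           prev[j - 1] + (r != h)))   # substitution
        prev = cur
    return prev[-1]

def cer(reference, hypothesis):
    return edit_distance(reference, hypothesis) / len(reference)

print(cer("今天天气很好", "今天天很好"))  # 1 deletion / 6 chars ~= 0.167
```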

@@ -157,7 +157,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
                 next_score.v_b = prefix_score.ViterbiScore() + prob;
                 next_score.times_b = prefix_score.Times();

-                // Prefix not changed, copy the context from pefix
+                // Prefix not changed, copy the context from prefix
                 if (context_graph_ && !next_score.has_context) {
                     next_score.CopyContext(prefix_score);
                     next_score.has_context = true;
@@ -183,7 +183,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
                     }
                 }

-                // Prefix not changed, copy the context from pefix
+                // Prefix not changed, copy the context from prefix
                 if (context_graph_ && !next_score1.has_context) {
                     next_score1.CopyContext(prefix_score);
                     next_score1.has_context = true;

@@ -72,7 +72,7 @@ bool CMVN::Read(std::vector<BaseFloat>* feats) {
         return false;
     }

-    // appply cmvn
+    // apply cmvn
     kaldi::Timer timer;
     Compute(feats);
     VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec.";

@@ -29,7 +29,7 @@ class CMVN : public FrontendInterface {
     // the length of feats = feature_row * feature_dim,
     // the Matrix is squashed into Vector
     virtual bool Read(std::vector<kaldi::BaseFloat>* feats);
-    // the dim_ is the feautre dim.
+    // the dim_ is the feature dim.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

@@ -47,7 +47,7 @@ class DecibelNormalizer : public FrontendInterface {
         std::unique_ptr<FrontendInterface> base_extractor);
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
-    // noramlize audio, the dim is 1.
+    // normalize audio, the dim is 1.
     virtual size_t Dim() const { return dim_; }
     virtual void SetFinished() { base_extractor_->SetFinished(); }
     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

@@ -244,8 +244,8 @@ void MatrixBase<Real>::SymAddMat2(const Real alpha,
   /// function will produce NaN in the output. This is a bug in the
   /// ATLAS library. To overcome this, the AddMatMat function, which calls
   /// cblas_Xgemm(...) rather than cblas_Xsyrk(...), is used in this special
-  /// sitation.
-  /// Wei Shi: Note this bug is observerd for single precision matrix
+  /// situation.
+  /// Wei Shi: Note this bug is observed for single precision matrix
   /// on a 64-bit machine
 #ifdef HAVE_ATLAS
   if (transA == kTrans && num_rows_ >= 56) {
@@ -683,7 +683,7 @@
   if (V_in == NULL) tmpV.Resize(1, this->num_cols_);  // work-space if V_in empty.

-  /// Impementation notes:
+  /// Implementation notes:
   /// Lapack works in column-order, therefore the dimensions of *this are
   /// swapped as well as the U and V matrices.
@@ -2378,7 +2378,7 @@ bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
   Matrix<Real> &M = *M_ptr;
   HtkHeader htk_hdr;

-  // TODO(arnab): this fails if the HTK file has CRC cheksum or is compressed.
+  // TODO(arnab): this fails if the HTK file has CRC checksum or is compressed.
   is.read((char*)&htk_hdr, sizeof(htk_hdr));  // we're being really POSIX here!
   if (is.fail()) {
     KALDI_WARN << "Could not read header from HTK feature file ";

@@ -235,7 +235,7 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
     memcpy(inc_data, mat.Data(), cols * rows * sizeof(Real));
   } else {
     for (MatrixIndexT i = 0; i < rows; i++) {
-      // copy the data to the propper position
+      // copy the data to the proper position
       memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
       // set new copy position
       inc_data += cols;

@@ -44,7 +44,7 @@ std::string ReadFile2String(const std::string& path) {
 }

 bool FileExists(const std::string& strFilename) {
-    // this funciton if from:
+    // this function if from:
     // https://github.com/kaldi-asr/kaldi/blob/master/src/fstext/deterministic-fst-test.cc
     struct stat stFileInfo;
     bool blnReturn;

@@ -407,7 +407,7 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) {
   if (os.fail())
     KALDI_WARN << "Stream failure detected.";
   // Write another newline as a terminating character. The read routine will
-  // detect this [this is a Kaldi mechanism, not somethig in the original
+  // detect this [this is a Kaldi mechanism, not something in the original
   // OpenFst code].
   os << '\n';
   return os.good();

@@ -34,7 +34,7 @@ bash run.sh --stop_stage 4
 ## Display Model with [Netron](https://github.com/lutzroeder/netron)

-If you have a model, we can using this commnd to show model graph.
+If you have a model, we can using this commend to show model graph.
 For example:
 ```

@@ -74,7 +74,7 @@ includes/
 #### set path
 push resource into android phone
-1. change resource path in conf to gloabal path, such as:
+1. change resource path in conf to global path, such as:
    [CONF]
    wav_normal=true
@@ -92,9 +92,9 @@ push resource into android phone
    high_freq=14000
    dither=0.0
 2. adb push conf label_list scp test.wav /data/local/tmp/
-3. set reource path in android demo(android_demo/app/src/main/cpp/native-lib.cpp) to actual path, such as:
+3. set resource path in android demo(android_demo/app/src/main/cpp/native-lib.cpp) to actual path, such as:
    std::string conf_path = "/data/local/tmp/conf";
    std::string wav_path = "/data/local/tmp/test.wav";
-4. excecute android_demo in android studio
+4. execute android_demo in android studio

@@ -156,8 +156,8 @@ class Analysis:
         return self.text[self.pos]

     #判断该字符是否是中文字符(不包括中文标点)
-    def isChineseChar(self, charater):
-        return 0x4e00 <= ord(charater) < 0x9fa6
+    def isChineseChar(self, character):
+        return 0x4e00 <= ord(character) < 0x9fa6

     #判断是否是ASCII码
     def isASCIIChar(self, ch):
@@ -253,7 +253,6 @@ class Analysis:
                 # print(word3.length, word3.text)
                 if word3.length == -1:
                     chunk = Chunk(word1, word2)
-                    # print("Ture")
                 else:
                     chunk = Chunk(word1, word2, word3)
                 chunks.append(chunk)
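The range test in `isChineseChar` above covers U+4E00 through U+9FA5, the basic CJK Unified Ideographs block, which is why Chinese punctuation falls outside it. As a standalone check:

```python
def is_chinese_char(ch: str) -> bool:
    return 0x4e00 <= ord(ch) < 0x9fa6  # CJK Unified Ideographs only

print([c for c in "hello,你好!" if is_chinese_char(c)])  # ['你', '好']
```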

@@ -181,8 +181,8 @@ template <typename T>
 class FlagRegisterer {
  public:
   FlagRegisterer(const string &name, const FlagDescription<T> &desc) {
-    auto registr = FlagRegister<T>::GetRegister();
-    registr->SetDescription(name, desc);
+    auto r = FlagRegister<T>::GetRegister();
+    r->SetDescription(name, desc);
   }

  private:

@@ -63,7 +63,7 @@ def create_manifest(data_dir, manifest_path_prefix):
             if line == '':
                 continue
             audio_id, text = line.split(' ', 1)
-            # remove withespace, charactor text
+            # remove withespace, character text
             text = ''.join(text.split())
             transcript_dict[audio_id] = text

@@ -66,8 +66,8 @@ config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')

-echo "Sevice ip: $server_ip" | tee ./log/test_result.log
-echo "Sevice port: $port" | tee -a ./log/test_result.log
+echo "Service ip: $server_ip" | tee ./log/test_result.log
+echo "Service port: $port" | tee -a ./log/test_result.log

 # whether a process is listening on $port
 pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@@ -190,7 +190,7 @@ echo "*********************************************************************************"
 echo "All tests completed." | tee -a ./log/test_result.log

-# sohw all the test results
+# show all the test results
 echo "***************** Here are all the test results ********************"
 cat ./log/test_result.log

@@ -76,8 +76,8 @@ config_file=./conf/application.yaml
 server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
 port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')

-echo "Sevice ip: $server_ip" | tee $log/test_result.log
-echo "Sevice port: $port" | tee -a $log/test_result.log
+echo "Service ip: $server_ip" | tee $log/test_result.log
+echo "Service port: $port" | tee -a $log/test_result.log

 # whether a process is listening on $port
 pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@@ -307,7 +307,7 @@ echo "*********************************************************************************"
 echo "All tests completed." | tee -a $log/test_result.log

-# sohw all the test results
+# show all the test results
 echo "***************** Here are all the test results ********************"
 cat $log/test_result.log

@@ -30,7 +30,7 @@ def _test_snapshot():
     # use a simplest iterable object as dataloader
     dataloader = count()

-    # hack the training proecss: training does nothing except increse iteration
+    # hack the training proecss: training does nothing except increase iteration
     updater = StandardUpdater(model, optimizer, dataloader=dataloader)
     updater.update_core = lambda x: None

@@ -17,13 +17,13 @@ cd liblbfgs-$VER
 ./configure --prefix=`pwd`
 make
 # due to the liblbfgs project directory structure, we have to use -i
-# but the erros are completely harmless
+# but the errors are completely harmless
 make -i install
 cd ..

 (
   [ ! -z "${LIBLBFGS}" ] && \
-    echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \
+    echo >&2 "LIBLBFGS variable is already defined. Undefining..." && \
     unset LIBLBFGS

   [ -f ./env.sh ] && . ./env.sh

@@ -68,7 +68,7 @@ make || exit
 cd ..

 (
   [ ! -z "${SRILM}" ] && \
-    echo >&2 "SRILM variable is aleady defined. Undefining..." && \
+    echo >&2 "SRILM variable is already defined. Undefining..." && \
     unset SRILM

   [ -f ./env.sh ] && . ./env.sh

@@ -44,7 +44,7 @@ add_arg('manifest_paths', str,
 # bpe
 add_arg('spm_model_prefix', str, None,
         "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
-add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
+add_arg('output_path', str, None, "filepath of formatted manifest.", required=True)
 # yapf: disable
 args = parser.parse_args()

@@ -32,7 +32,7 @@ def main(args):
         # leaving `token`
         print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
         node += 1
-    # Fianl node
+    # Final node
     print('0')

@@ -21,7 +21,7 @@ cp -r $src_lang $tgt_lang
 # eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
 # s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
 # G.fst, the disambiguation symbol #0 only appears on the input side
-# do eps2disambig.pl and s2eps.pl maybe just for fallowing `fstrmepsilon`.
+# do eps2disambig.pl and s2eps.pl maybe just for following `fstrmepsilon`.
 cat $arpa_lm | \
     grep -v '<s> <s>' | \
     grep -v '</s> <s>' | \

@@ -3,7 +3,7 @@
 '''
 Merge training configs into a single inference config.
 The single inference config is for CLI, which only takes a single config to do inferencing.
-The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file.
+The training configs includes: model config, preprocess config, decode config, vocab file and cmvn file.

 Process:
 # step 1: prepare dir
@@ -11,7 +11,7 @@
 cp -r exp conf data release_dir
 cd release_dir

-# step 2: get "model.yaml" which conatains all configuration info.
+# step 2: get "model.yaml" which contains all configuration info.
 # if does not contain preprocess.yaml file. e.g ds2:
 python generate_infer_yaml.py --cfg_pth conf/deepspeech2_online.yaml --dcd_pth conf/tuning/chunk_decode.yaml --vb_pth data/lang_char/vocab.txt --cmvn_pth data/mean_std.json --save_pth model.yaml --pre_pth null

 # if contains preprocess.yaml file. e.g u2:

@@ -79,7 +79,7 @@ if ($HELP)
     print "  -b     ... disable Perl buffering.\n";
     print "  -time  ... enable processing time calculation.\n";
     print "  -penn  ... use Penn treebank-like tokenization.\n";
-    print "  -protected FILE ... specify file with patters to be protected in tokenisation.\n";
+    print "  -protected FILE ... specify file with patterns to be protected in tokenisation.\n";
     print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
     exit;
 }

@@ -37,7 +37,7 @@ fi
 # the text should be properly pre-processed, e.g:
 # cleand, normalized and possibly word-segmented

-# get rid off irrelavent symbols
+# get rid off irrelevant symbols
 grep -v '<eps>' $symbol_table \
     | grep -v '#0' \
     | grep -v '<unk>' | grep -v '<UNK>' \
@@ -51,7 +51,7 @@ grep -v '<eps>' $symbol_table \
 #
 # TL;DR reason:
 # Unlike SRILM's -limit-vocab, kenlm's --limit_vocab_file option
-# spcifies a *valid* set of vocabulary, whereas *valid but unseen*
+# specifies a *valid* set of vocabulary, whereas *valid but unseen*
 # words are discarded in final arpa.
 # So the trick is,
 # we explicitly add kaldi's vocab(one word per line) to training text,

@@ -1288,7 +1288,7 @@ def normalize_corpus(corpus,
 def char_token(s: Text) -> List[Text]:
-    """chinese charactor
+    """chinese character

     Args:
         s (Text): "我爱中国“
