commit eaebcf409a (pull/4003/head)
liyulingyue, 6 months ago

Merge branch 'develop' into docker

@@ -233,7 +233,7 @@ def spectrogram(waveform: Tensor,
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
window_type (str, optional): Choose type of window for FFT computation. Defaults to "povey".
@@ -443,7 +443,7 @@ def fbank(waveform: Tensor,
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
@@ -566,7 +566,7 @@ def mfcc(waveform: Tensor,
round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
to FFT. Defaults to True.
sr (int, optional): Sample rate of input waveform. Defaults to 16000.
-snip_edges (bool, optional): Drop samples in the end of waveform that cann't fit a signal frame when it
+snip_edges (bool, optional): Drop samples in the end of waveform that can't fit a signal frame when it
is set True. Otherwise performs reflect padding to the end of waveform. Defaults to True.
subtract_mean (bool, optional): Whether to subtract mean of feature files. Defaults to False.
use_energy (bool, optional): Add an dimension with energy of spectrogram to the output. Defaults to False.
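The snip_edges flag documented in these hunks decides how many frames a signal yields. A minimal Python sketch of the assumed Kaldi-style frame counting (illustrative only, not part of this change):

def num_frames(num_samples, frame_length, frame_shift, snip_edges=True):
    # snip_edges=True: keep only frames that fit entirely inside the signal
    if snip_edges:
        if num_samples < frame_length:
            return 0
        return 1 + (num_samples - frame_length) // frame_shift
    # snip_edges=False: reflect-pad the end so every sample is covered
    return (num_samples + frame_shift // 2) // frame_shift

print(num_frames(16000, 400, 160, snip_edges=True))   # 98
print(num_frames(16000, 400, 160, snip_edges=False))  # 100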

@@ -527,7 +527,7 @@ def melspectrogram(x: np.ndarray,
if fmax is None:
fmax = sr // 2
if fmin < 0 or fmin >= fmax:
-raise ParameterError('fmin and fmax must statisfy 0<fmin<fmax')
+raise ParameterError('fmin and fmax must satisfy 0<fmin<fmax')
s = stft(
x,

@@ -35,7 +35,7 @@ class ESC50(AudioClassificationDataset):
http://dx.doi.org/10.1145/2733373.2806390
"""
-archieves = [
+archives = [
{
'url':
'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
@@ -133,7 +133,7 @@ class ESC50(AudioClassificationDataset):
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-download_and_decompress(self.archieves, DATA_HOME)
+download_and_decompress(self.archives, DATA_HOME)
meta_info = self._get_meta_info()
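The dataset hunks in this commit all follow the same pattern: declare the archives, then download and unpack lazily. A condensed sketch of that pattern (DATA_HOME location and the helper's import path are assumptions; checksum elided):

import os

DATA_HOME = os.path.expanduser("~/.paddleaudio/datasets")  # location assumed

archives = [{
    'url': 'https://paddleaudio.bj.bcebos.com/datasets/ESC-50-master.zip',
    'md5': '<checksum>',  # placeholder
}]

def ensure_dataset(audio_path, meta):
    # download and unpack only when the extracted files are missing
    if not os.path.isdir(os.path.join(DATA_HOME, audio_path)) or \
            not os.path.isfile(os.path.join(DATA_HOME, meta)):
        download_and_decompress(archives, DATA_HOME)  # helper from the diff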

@@ -35,7 +35,7 @@ class GTZAN(AudioClassificationDataset):
https://ieeexplore.ieee.org/document/1021072/
"""
-archieves = [
+archives = [
{
'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
@@ -85,7 +85,7 @@ class GTZAN(AudioClassificationDataset):
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-download_and_decompress(self.archieves, DATA_HOME)
+download_and_decompress(self.archives, DATA_HOME)
meta_info = self._get_meta_info()
random.seed(seed) # shuffle samples to split data

@@ -30,7 +30,7 @@ __all__ = ['OpenRIRNoise']
class OpenRIRNoise(Dataset):
-archieves = [
+archives = [
{
'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
'md5': 'e6f48e257286e05de56413b4779d8ffb',
@@ -76,7 +76,7 @@ class OpenRIRNoise(Dataset):
print(f"rirs noises base path: {self.base_path}")
if not os.path.isdir(self.base_path):
download_and_decompress(
-self.archieves, self.base_path, decompress=True)
+self.archives, self.base_path, decompress=True)
else:
print(
f"{self.base_path} already exists, we will not download and decompress again"

@@ -37,7 +37,7 @@ class TESS(AudioClassificationDataset):
https://doi.org/10.5683/SP2/E8H2MF
"""
-archieves = [
+archives = [
{
'url':
'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
@@ -93,7 +93,7 @@ class TESS(AudioClassificationDataset):
def _get_data(self, mode, seed, n_folds,
split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
-download_and_decompress(self.archieves, DATA_HOME)
+download_and_decompress(self.archives, DATA_HOME)
wav_files = []
for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):

@@ -35,7 +35,7 @@ class UrbanSound8K(AudioClassificationDataset):
https://dl.acm.org/doi/10.1145/2647868.2655045
"""
-archieves = [
+archives = [
{
'url':
'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
@@ -81,7 +81,7 @@ class UrbanSound8K(AudioClassificationDataset):
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
-download_and_decompress(self.archieves, DATA_HOME)
+download_and_decompress(self.archives, DATA_HOME)
meta_info = self._get_meta_info()

@@ -34,7 +34,7 @@ __all__ = ['VoxCeleb']
class VoxCeleb(Dataset):
source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
-archieves_audio_dev = [
+archives_audio_dev = [
{
'url': source_url + 'vox1_dev_wav_partaa',
'md5': 'e395d020928bc15670b570a21695ed96',
@@ -52,13 +52,13 @@ class VoxCeleb(Dataset):
'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
},
]
-archieves_audio_test = [
+archives_audio_test = [
{
'url': source_url + 'vox1_test_wav.zip',
'md5': '185fdc63c3c739954633d50379a3d102',
},
]
-archieves_meta = [
+archives_meta = [
{
'url':
'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
@@ -135,11 +135,11 @@ class VoxCeleb(Dataset):
if not os.path.isdir(self.wav_path):
print("start to download the voxceleb1 dataset")
download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip
-self.archieves_audio_dev,
+self.archives_audio_dev,
self.base_path,
decompress=False)
download_and_decompress( # download the vox1_test_wav.zip and unzip
-self.archieves_audio_test,
+self.archives_audio_test,
self.base_path,
decompress=True)
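The comment in this hunk notes that the dev-set zip parts are concatenated into vox1_dev_wav.zip before unzipping. In plain Python that step looks roughly like this (part list abbreviated):

import shutil

parts = ["vox1_dev_wav_partaa", "vox1_dev_wav_partab"]  # ... remaining parts
with open("vox1_dev_wav.zip", "wb") as out:
    for part in parts:
        with open(part, "rb") as f:
            shutil.copyfileobj(f, out)  # byte-level concatenation, like `cat`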
@@ -157,7 +157,7 @@ class VoxCeleb(Dataset):
if not os.path.isdir(self.meta_path):
print("prepare the meta data")
download_and_decompress(
-self.archieves_meta, self.meta_path, decompress=False)
+self.archives_meta, self.meta_path, decompress=False)
# Data preparation.
if not os.path.isdir(self.csv_path):

@@ -109,7 +109,7 @@ def create_manifest(data_dir, manifest_path):
def prepare_chime3(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file."""
"""Download, unpack and create summary manifest file."""
if not os.path.exists(os.path.join(target_dir, "CHiME3")):
# download
filepath = download(url, md5sum, target_dir,

@@ -210,7 +210,7 @@ def create_manifest(data_dir, manifest_path_prefix):
def prepare_dataset(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file.
"""Download, unpack and create summary manifest file.
"""
filepath = os.path.join(target_dir, "TIMIT.zip")
if not os.path.exists(filepath):

@@ -115,27 +115,27 @@ int FrontEngineInterface::init() {
// Generate the dictionary (word to phoneme mapping)
if (0 != GenDict(_word2phone_path, &word_phone_map)) {
LOG(ERROR) << "Genarate word2phone dict failed";
LOG(ERROR) << "Generate word2phone dict failed";
return -1;
}
// Generate the phoneme dictionary (phoneme to phoneme-id mapping)
if (0 != GenDict(_phone2id_path, &phone_id_map)) {
LOG(ERROR) << "Genarate phone2id dict failed";
LOG(ERROR) << "Generate phone2id dict failed";
return -1;
}
// Generate the tone dictionary (tone to tone-id mapping)
if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Genarate tone2id dict failed";
LOG(ERROR) << "Generate tone2id dict failed";
return -1;
}
}
// Generate the traditional-simplified dictionary (traditional to simplified mapping)
if (0 != GenDict(_trand2simp_path, &trand_simp_map)) {
LOG(ERROR) << "Genarate trand2simp dict failed";
LOG(ERROR) << "Generate trand2simp dict failed";
return -1;
}
@@ -263,7 +263,7 @@ int FrontEngineInterface::GetWordsIds(
if (0 !=
GetInitialsFinals(word, &word_initials, &word_finals)) {
LOG(ERROR)
<< "Genarate the word_initials and word_finals of "
<< "Generate the word_initials and word_finals of "
<< word << " failed";
return -1;
}
@@ -304,7 +304,7 @@ int FrontEngineInterface::GetWordsIds(
// Phoneme to phoneme id
if (0 != Phone2Phoneid(phone, phoneids, toneids)) {
LOG(ERROR) << "Genarate the phone id of " << word << " failed";
LOG(ERROR) << "Generate the phone id of " << word << " failed";
return -1;
}
}
@@ -916,11 +916,11 @@ int FrontEngineInterface::NeuralSandhi(const std::string &word,
if (find(must_neural_tone_words.begin(),
must_neural_tone_words.end(),
word) != must_neural_tone_words.end() ||
-(word_num >= 2 &&
-find(must_neural_tone_words.begin(),
-must_neural_tone_words.end(),
-ppspeech::wstring2utf8string(word_wstr.substr(
-word_num - 2))) != must_neural_tone_words.end())) {
+(word_num >= 2 && find(must_neural_tone_words.begin(),
+must_neural_tone_words.end(),
+ppspeech::wstring2utf8string(
+word_wstr.substr(word_num - 2))) !=
+must_neural_tone_words.end())) {
(*finals).back() =
(*finals).back().replace((*finals).back().length() - 1, 1, "5");
}

@@ -77,13 +77,13 @@ class MilvusHelper:
field1 = FieldSchema(
name="id",
dtype=DataType.INT64,
descrition="int64",
description="int64",
is_primary=True,
auto_id=True)
field2 = FieldSchema(
name="embedding",
dtype=DataType.FLOAT_VECTOR,
descrition="speaker embeddings",
description="speaker embeddings",
dim=VECTOR_DIMENSION,
is_primary=False)
schema = CollectionSchema(
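For context, the two fields above combine into a collection roughly like this (pymilvus assumed; the collection name and dimension are illustrative stand-ins for the module's constants):

from pymilvus import Collection, CollectionSchema, DataType, FieldSchema

fields = [
    FieldSchema(name="id", dtype=DataType.INT64, description="int64",
                is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR,
                description="speaker embeddings", dim=192),  # dim assumed
]
collection = Collection("speaker_audio", CollectionSchema(fields))  # name assumed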

@@ -225,7 +225,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
websocket (WebSocket): the websocket instance
"""
-#1. the interface wait to accept the websocket protocal header
+#1. the interface wait to accept the websocket protocol header
# and only we receive the header, it establish the connection with specific thread
await websocket.accept()
@@ -238,7 +238,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
connection_handler = None
try:
-#4. we do a loop to process the audio package by package according the protocal
+#4. we do a loop to process the audio package by package according the protocol
# and only if the client send finished signal, we will break the loop
while True:
# careful here, changed the source code from starlette.websockets
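The numbered comments in these hunks describe the server side of the streaming protocol: accept the handshake, then consume audio package by package until a finished signal arrives. A minimal client-side sketch of that exchange (message field names are assumptions):

import asyncio
import json
import websockets

async def stream_audio(url, chunks):
    async with websockets.connect(url) as ws:            # 1. handshake
        await ws.send(json.dumps({"signal": "start"}))   # 2. start signal
        for chunk in chunks:                             # 3. audio, package by package
            await ws.send(chunk)
            print(await ws.recv())                       # partial result
        await ws.send(json.dumps({"signal": "end"}))     # 4. finished signal
        print(await ws.recv())                           # final result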

@@ -75,7 +75,7 @@ class TritonPythonModel:
def initialize(self, args):
"""`initialize` is called only once when the model is being loaded.
Implementing `initialize` function is optional. This function allows
-the model to intialize any state associated with this model.
+the model to initialize any state associated with this model.
Parameters
----------
args : dict

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
@@ -66,14 +66,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -39,7 +39,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -77,7 +77,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -45,7 +45,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -63,14 +63,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -43,7 +43,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -65,14 +65,14 @@ model:
cnn_decoder_embedding_dim: 256
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
encoder_normalize_before: True # whether to perform layer normalization before the input
@@ -66,14 +66,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -38,7 +38,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -38,7 +38,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -42,7 +42,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -83,7 +83,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -34,7 +34,7 @@ model: # keyword arguments for the selected model
dunits: 1024 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 1 # kernel size of position wise conv layer
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -38,7 +38,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -68,14 +68,14 @@ model:
duration_predictor_dropout_rate: 0.5 # dropout rate in energy predictor
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -38,7 +38,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -38,7 +38,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -76,7 +76,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -97,7 +97,7 @@ def test_full_scores_words():
if w not in model:
print('"{0}" is an OOV'.format(w))
oov.append(w)
-# zh_giga.no_cna_cmn.prune01244.klm is chinese charactor LM
+# zh_giga.no_cna_cmn.prune01244.klm is chinese character LM
assert oov == ["盘点", "不怕", "网站", "", "", "海淘", "向来", "便宜", "保真",
""], 'error oov'

@@ -54,8 +54,8 @@ elif [ "${cmd_backend}" = sge ]; then
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partition" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -39,7 +39,7 @@ generator_params:
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
@@ -77,7 +77,7 @@ discriminator_params:
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer."
nonlinear_activation: "leakyrelu" # Nonlinear activation.
-nonlinear_activation_params: # Nonlinear activation paramters.
+nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.

@@ -32,8 +32,8 @@ def main(args, config):
seed_everything(config.seed)
# stage 1: generate the voxceleb csv file
-# Note: this may occurs c++ execption, but the program will execute fine
-# so we ignore the execption
+# Note: this may occurs c++ exception, but the program will execute fine
+# so we ignore the exception
# we explicitly pass the vox2 base path to data prepare and generate the audio info
logger.info("start to generate the voxceleb dataset info")
train_dataset = VoxCeleb(

@@ -42,7 +42,7 @@ model:
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
-postnet_layers: 5 # number of layers of postnset
+postnet_layers: 5 # number of layers of postnet
postnet_filts: 5 # filter size of conv layers in postnet
postnet_chans: 256 # number of channels of conv layers in postnet
use_scaled_pos_enc: True # whether to use scaled positional encoding
@@ -60,14 +60,14 @@ model:
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
-pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor
+pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
-energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor
+energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy

@@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
max_size = paddle.shape(sequences[0])
-# (TODO Hui Zhang): slice not supprot `end==start`
+# (TODO Hui Zhang): slice not support `end==start`
# trailing_dims = max_size[1:]
trailing_dims = tuple(
max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -93,7 +93,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor
if batch_first:
-# TODO (Hui Zhang): set_value op not supprot `end==start`
+# TODO (Hui Zhang): set_value op not support `end==start`
# TODO (Hui Zhang): set_value op not support int16
# TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
@@ -102,7 +102,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
else:
out_tensor[i, length] = tensor
else:
-# TODO (Hui Zhang): set_value op not supprot `end==start`
+# TODO (Hui Zhang): set_value op not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i] = tensor
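These TODOs all work around the same limitation: Paddle's slice/set_value ops reject empty ranges (end == start), hence the explicit length != 0 guard. A usage sketch of the function itself (assuming pad_sequence from the module patched above is importable):

import paddle

a = paddle.ones([4, 3])
b = paddle.ones([2, 3])
padded = pad_sequence([a, b], batch_first=True)
print(padded.shape)  # [2, 4, 3]; b is zero-padded along its first axis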

@@ -62,7 +62,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if line == '':
continue
audio_id, text = line.split(' ', 1)
-# remove withespace, charactor text
+# remove withespace, character text
text = ''.join(text.split())
transcript_dict[audio_id] = text

@@ -65,7 +65,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if line == '':
continue
audio_id, text = line.split(' ', 1)
-# remove withespace, charactor text
+# remove withespace, character text
text = ''.join(text.split())
transcript_dict[audio_id] = text
@@ -159,7 +159,7 @@ def check_dataset(data_dir):
if line == '':
continue
audio_id, text = line.split(' ', 1)
-# remove withespace, charactor text
+# remove withespace, character text
text = ''.join(text.split())
transcript_dict[audio_id] = text

@@ -362,7 +362,7 @@ class HubertASRTrainer(Trainer):
scratch = None
if self.args.resume:
# just restore ckpt
-# lr will resotre from optimizer ckpt
+# lr will restore from optimizer ckpt
resume_json_path = os.path.join(self.checkpoint_dir,
self.args.resume + '.json')
with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -370,20 +370,20 @@ class HubertASRTrainer(Trainer):
self.iteration = 0
self.epoch = resume_json["epoch"]
-# resotre model from *.pdparams
+# restore model from *.pdparams
params_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdparams'
model_dict = paddle.load(params_path)
self.model.set_state_dict(model_dict)
-# resotre optimizer from *.pdopt
+# restore optimizer from *.pdopt
optimizer_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdopt'
optimizer_dict = paddle.load(optimizer_path)
self.model_optimizer.set_state_dict(optimizer_dict['model'])
self.hubert_optimizer.set_state_dict(optimizer_dict['hubert'])
-# resotre lr_scheduler from *.pdlrs
+# restore lr_scheduler from *.pdlrs
scheduler_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdlrs'
if os.path.isfile(os.path.join(scheduler_path)):

@@ -361,7 +361,7 @@ class Wav2Vec2ASRTrainer(Trainer):
scratch = None
if self.args.resume:
# just restore ckpt
-# lr will resotre from optimizer ckpt
+# lr will restore from optimizer ckpt
resume_json_path = os.path.join(self.checkpoint_dir,
self.args.resume + '.json')
with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -369,20 +369,20 @@ class Wav2Vec2ASRTrainer(Trainer):
self.iteration = 0
self.epoch = resume_json["epoch"]
-# resotre model from *.pdparams
+# restore model from *.pdparams
params_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdparams'
model_dict = paddle.load(params_path)
self.model.set_state_dict(model_dict)
-# resotre optimizer from *.pdopt
+# restore optimizer from *.pdopt
optimizer_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdopt'
optimizer_dict = paddle.load(optimizer_path)
self.model_optimizer.set_state_dict(optimizer_dict['model'])
self.wav2vec2_optimizer.set_state_dict(optimizer_dict['wav2vec2'])
-# resotre lr_scheduler from *.pdlrs
+# restore lr_scheduler from *.pdlrs
scheduler_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdlrs'
if os.path.isfile(os.path.join(scheduler_path)):

@@ -361,7 +361,7 @@ class WavLMASRTrainer(Trainer):
scratch = None
if self.args.resume:
# just restore ckpt
-# lr will resotre from optimizer ckpt
+# lr will restore from optimizer ckpt
resume_json_path = os.path.join(self.checkpoint_dir,
self.args.resume + '.json')
with open(resume_json_path, 'r', encoding='utf8') as f:
@@ -369,20 +369,20 @@ class WavLMASRTrainer(Trainer):
self.iteration = 0
self.epoch = resume_json["epoch"]
-# resotre model from *.pdparams
+# restore model from *.pdparams
params_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdparams'
model_dict = paddle.load(params_path)
self.model.set_state_dict(model_dict)
-# resotre optimizer from *.pdopt
+# restore optimizer from *.pdopt
optimizer_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdopt'
optimizer_dict = paddle.load(optimizer_path)
self.model_optimizer.set_state_dict(optimizer_dict['model'])
self.wavlm_optimizer.set_state_dict(optimizer_dict['wavlm'])
-# resotre lr_scheduler from *.pdlrs
+# restore lr_scheduler from *.pdlrs
scheduler_path = os.path.join(self.checkpoint_dir,
"{}".format(self.epoch)) + '.pdlrs'
if os.path.isfile(os.path.join(scheduler_path)):

@@ -215,7 +215,7 @@ class Trainer():
checkpoint_path=self.args.checkpoint_path)
if infos:
# just restore ckpt
-# lr will resotre from optimizer ckpt
+# lr will restore from optimizer ckpt
self.iteration = infos["step"]
self.epoch = infos["epoch"]

@@ -171,7 +171,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
def cer(reference, hypothesis, ignore_case=False, remove_space=False):
"""Calculate charactor error rate (CER). CER compares reference text and
"""Calculate character error rate (CER). CER compares reference text and
hypothesis text in char-level. CER is defined as:
.. math::

@@ -80,7 +80,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
# assuming trailing dimensions and type of all the Tensors
# in sequences are same and fetching those from sequences[0]
max_size = paddle.shape(sequences[0])
-# (TODO Hui Zhang): slice not supprot `end==start`
+# (TODO Hui Zhang): slice not support `end==start`
# trailing_dims = max_size[1:]
trailing_dims = tuple(
max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
@@ -98,7 +98,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
f"length {length}, out_tensor {out_tensor.shape}, tensor {tensor.shape}"
)
if batch_first:
-# TODO (Hui Zhang): set_value op not supprot `end==start`
+# TODO (Hui Zhang): set_value op not support `end==start`
# TODO (Hui Zhang): set_value op not support int16
# TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...]
# out_tensor[i, :length, ...] = tensor
@@ -107,7 +107,7 @@ def pad_sequence(sequences: List[paddle.Tensor],
else:
out_tensor[i, length] = tensor
else:
-# TODO (Hui Zhang): set_value op not supprot `end==start`
+# TODO (Hui Zhang): set_value op not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i] = tensor

@@ -79,7 +79,7 @@ class ASRWsAudioHandler:
punc_server_ip=None,
punc_server_port=None):
"""PaddleSpeech Online ASR Server Client audio handler
-Online asr server use the websocket protocal
+Online asr server use the websocket protocol
Args:
url (str, optional): the server ip. Defaults to None.
port (int, optional): the server port. Defaults to None.
@@ -144,10 +144,10 @@ class ASRWsAudioHandler:
logger.error("No asr server, please input valid ip and port")
return ""
-# 1. send websocket handshake protocal
+# 1. send websocket handshake protocol
start_time = time.time()
async with websockets.connect(self.url) as ws:
-# 2. server has already received handshake protocal
+# 2. server has already received handshake protocol
# client start to send the command
audio_info = json.dumps(
{
@@ -255,7 +255,7 @@ class ASRHttpHandler:
class TTSWsHandler:
def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
"""PaddleSpeech Online TTS Server Client audio handler
-Online tts server use the websocket protocal
+Online tts server use the websocket protocol
Args:
server (str, optional): the server ip. Defaults to "127.0.0.1".
port (int, optional): the server port. Defaults to 8092.
@@ -405,7 +405,7 @@ class TTSWsHandler:
class TTSHttpHandler:
def __init__(self, server="127.0.0.1", port=8092, play: bool=False):
"""PaddleSpeech Online TTS Server Client audio handler
-Online tts server use the websocket protocal
+Online tts server use the websocket protocol
Args:
server (str, optional): the server ip. Defaults to "127.0.0.1".
port (int, optional): the server port. Defaults to 8092.

@@ -31,7 +31,7 @@ async def websocket_endpoint(websocket: WebSocket):
websocket (WebSocket): the websocket instance
"""
-#1. the interface wait to accept the websocket protocal header
+#1. the interface wait to accept the websocket protocol header
# and only we receive the header, it establish the connection with specific thread
await websocket.accept()
@@ -45,7 +45,7 @@ async def websocket_endpoint(websocket: WebSocket):
connection_handler = None
try:
-#4. we do a loop to process the audio package by package according the protocal
+#4. we do a loop to process the audio package by package according the protocol
# and only if the client send finished signal, we will break the loop
while True:
# careful here, changed the source code from starlette.websockets
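A client-side sketch of the flow described in steps 1 and 4, using the third-party `websockets` package (the endpoint URL and the start/end message schema here are assumptions, not the server's exact API):
```
import asyncio
import json

import websockets  # pip install websockets

async def stream_audio(url, wav_bytes, chunk_size=3200):
    # connecting performs the handshake the endpoint accepts in step 1
    async with websockets.connect(url) as ws:
        await ws.send(json.dumps({"signal": "start"}))  # schema assumed
        # the server loops package by package (step 4), so send chunks
        for i in range(0, len(wav_bytes), chunk_size):
            await ws.send(wav_bytes[i:i + chunk_size])
        await ws.send(json.dumps({"signal": "end"}))    # finished signal
        return await ws.recv()                          # final result

# usage (url is an assumption):
# print(asyncio.run(stream_audio("ws://127.0.0.1:8090/ws/asr", pcm_bytes)))
```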

@ -32,7 +32,7 @@ async def websocket_endpoint(websocket: WebSocket):
websocket (WebSocket): the websocket instance
"""
#1. the interface wait to accept the websocket protocal header
#1. the interface waits to accept the websocket protocol header
# and only once we receive the header does it establish the connection with a specific thread
await websocket.accept()

@ -523,7 +523,7 @@ class Frontend():
initials = []
finals = []
# to charactor list
# to character list
words = self._split_word_to_char(words[0])
for pinyin, char in zip(pinyin_spec, words):

@ -76,7 +76,7 @@ class ResidualAffineCouplingBlock(nn.Layer):
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.
@ -169,7 +169,7 @@ class ResidualAffineCouplingLayer(nn.Layer):
use_weight_norm (bool):
Whether to use weight normalization in WaveNet.
bias (bool):
Whether to use bias paramters in WaveNet.
Whether to use bias parameters in WaveNet.
use_only_mean (bool):
Whether to estimate only mean.

@ -159,7 +159,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
def cer(reference, hypothesis, ignore_case=False, remove_space=False):
"""Calculate charactor error rate (CER). CER compares reference text and
"""Calculate character error rate (CER). CER compares reference text and
hypothesis text at the character level. CER is defined as:
.. math::
CER = (Sc + Dc + Ic) / Nc
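As an illustration of the formula, a self-contained sketch that computes Sc + Dc + Ic (substitutions, deletions, insertions) over Nc reference characters via a character-level Levenshtein distance (the real `cer` additionally supports `ignore_case` and `remove_space`):
```
def cer_sketch(reference, hypothesis):
    # edit distance Sc + Dc + Ic between character sequences, one-row DP
    ref, hyp = list(reference), list(hypothesis)
    dp = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        prev, dp[0] = dp[0], i
        for j, h in enumerate(hyp, 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,        # deletion
                        dp[j - 1] + 1,    # insertion
                        prev + (r != h))  # substitution (free if chars match)
            prev = cur
    return dp[-1] / len(ref)  # (Sc + Dc + Ic) / Nc

print(cer_sketch("paddle speech", "paddle speach"))  # 1 substitution / 13 chars
```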

@ -157,7 +157,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
next_score.v_b = prefix_score.ViterbiScore() + prob;
next_score.times_b = prefix_score.Times();
// Prefix not changed, copy the context from pefix
// Prefix not changed, copy the context from prefix
if (context_graph_ && !next_score.has_context) {
next_score.CopyContext(prefix_score);
next_score.has_context = true;
@ -183,7 +183,7 @@ void CTCPrefixBeamSearch::AdvanceDecoding(
}
}
// Prefix not changed, copy the context from pefix
// Prefix not changed, copy the context from prefix
if (context_graph_ && !next_score1.has_context) {
next_score1.CopyContext(prefix_score);
next_score1.has_context = true;

@ -72,7 +72,7 @@ bool CMVN::Read(std::vector<BaseFloat>* feats) {
return false;
}
// appply cmvn
// apply cmvn
kaldi::Timer timer;
Compute(feats);
VLOG(1) << "CMVN::Read cost: " << timer.Elapsed() << " sec.";
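What "apply cmvn" computes, as a NumPy sketch: per-dimension mean/variance normalization of a (frames, dim) feature matrix (loading the precomputed stats is omitted here and their layout is an assumption):
```
import numpy as np

def apply_cmvn_sketch(feats, mean, var):
    # cepstral mean and variance normalization, per feature dimension
    return (feats - mean) / np.sqrt(var + 1e-20)

# usage: stats would normally come from a precomputed cmvn file
feats = np.random.randn(100, 80).astype("float32")
normed = apply_cmvn_sketch(feats, feats.mean(axis=0), feats.var(axis=0))
print(normed.mean(axis=0)[:3])  # approximately zero after normalization
```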

@ -29,7 +29,7 @@ class CMVN : public FrontendInterface {
// the length of feats = feature_row * feature_dim,
// the Matrix is squashed into Vector
virtual bool Read(std::vector<kaldi::BaseFloat>* feats);
// the dim_ is the feautre dim.
// the dim_ is the feature dim.
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

@ -47,7 +47,7 @@ class DecibelNormalizer : public FrontendInterface {
std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
// noramlize audio, the dim is 1.
// normalize audio, the dim is 1.
virtual size_t Dim() const { return dim_; }
virtual void SetFinished() { base_extractor_->SetFinished(); }
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
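A rough sketch of what a decibel normalizer does to that 1-D waveform: scale it so its RMS level reaches a target dB value (the target level and any clipping policy are assumptions):
```
import numpy as np

def db_normalize_sketch(wave, target_db=-20.0):
    # measure the current RMS level in dB, then apply the gain to reach target_db
    rms_db = 10.0 * np.log10(np.mean(wave ** 2) + 1e-20)
    gain_db = target_db - rms_db
    return wave * (10.0 ** (gain_db / 20.0))

# usage: a quiet signal gets boosted to the target level
print(db_normalize_sketch(np.random.randn(16000) * 0.01)[:3])
```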

@ -244,8 +244,8 @@ void MatrixBase<Real>::SymAddMat2(const Real alpha,
/// function will produce NaN in the output. This is a bug in the
/// ATLAS library. To overcome this, the AddMatMat function, which calls
/// cblas_Xgemm(...) rather than cblas_Xsyrk(...), is used in this special
/// sitation.
/// Wei Shi: Note this bug is observerd for single precision matrix
/// situation.
/// Wei Shi: Note this bug is observed for single precision matrix
/// on a 64-bit machine
#ifdef HAVE_ATLAS
if (transA == kTrans && num_rows_ >= 56) {
@ -683,7 +683,7 @@ empty.
if (V_in == NULL) tmpV.Resize(1, this->num_cols_); // work-space if V_in
empty.
/// Impementation notes:
/// Implementation notes:
/// Lapack works in column-order, therefore the dimensions of *this are
/// swapped as well as the U and V matrices.
@ -2378,7 +2378,7 @@ bool ReadHtk(std::istream &is, Matrix<Real> *M_ptr, HtkHeader *header_ptr)
Matrix<Real> &M = *M_ptr;
HtkHeader htk_hdr;
// TODO(arnab): this fails if the HTK file has CRC cheksum or is compressed.
// TODO(arnab): this fails if the HTK file has CRC checksum or is compressed.
is.read((char*)&htk_hdr, sizeof(htk_hdr)); // we're being really POSIX here!
if (is.fail()) {
KALDI_WARN << "Could not read header from HTK feature file ";

@ -235,7 +235,7 @@ void VectorBase<Real>::CopyRowsFromMat(const MatrixBase<Real> &mat) {
memcpy(inc_data, mat.Data(), cols * rows * sizeof(Real));
} else {
for (MatrixIndexT i = 0; i < rows; i++) {
// copy the data to the propper position
// copy the data to the proper position
memcpy(inc_data, mat.RowData(i), cols * sizeof(Real));
// set new copy position
inc_data += cols;

@ -44,7 +44,7 @@ std::string ReadFile2String(const std::string& path) {
}
bool FileExists(const std::string& strFilename) {
// this funciton if from:
// this function is from:
// https://github.com/kaldi-asr/kaldi/blob/master/src/fstext/deterministic-fst-test.cc
struct stat stFileInfo;
bool blnReturn;

@ -407,7 +407,7 @@ bool WriteLattice(std::ostream &os, bool binary, const Lattice &t) {
if (os.fail())
KALDI_WARN << "Stream failure detected.";
// Write another newline as a terminating character. The read routine will
// detect this [this is a Kaldi mechanism, not somethig in the original
// detect this [this is a Kaldi mechanism, not something in the original
// OpenFst code].
os << '\n';
return os.good();

@ -34,7 +34,7 @@ bash run.sh --stop_stage 4
## Display Model with [Netron](https://github.com/lutzroeder/netron)
If you have a model, we can using this commnd to show model graph.
If you have a model, we can use this command to show the model graph.
For example:
```

@ -74,7 +74,7 @@ includes/
#### set path
push resource into android phone
1. change resource path in conf to gloabal path, such as:
1. change resource path in conf to global path, such as:
[CONF]
wav_normal=true
@ -92,9 +92,9 @@ push resource into android phone
high_freq=14000
dither=0.0
2. adb push conf label_list scp test.wav /data/local/tmp/
3. set reource path in android demo(android_demo/app/src/main/cpp/native-lib.cpp) to actual path, such as:
3. set the resource path in the android demo (android_demo/app/src/main/cpp/native-lib.cpp) to the actual path, such as:
std::string conf_path = "/data/local/tmp/conf";
std::string wav_path = "/data/local/tmp/test.wav";
4. excecute android_demo in android studio
4. execute android_demo in android studio

@ -156,8 +156,8 @@ class Analysis:
return self.text[self.pos]
# Check whether the character is a Chinese character (excluding Chinese punctuation)
def isChineseChar(self, charater):
return 0x4e00 <= ord(charater) < 0x9fa6
def isChineseChar(self, character):
return 0x4e00 <= ord(character) < 0x9fa6
# Check whether it is an ASCII character
def isASCIIChar(self, ch):
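For clarity, the range check in the fixed `isChineseChar` covers only the basic CJK Unified Ideographs block (U+4E00–U+9FA5), not Chinese punctuation or the extension blocks; a standalone sketch:
```
def is_chinese_char(character):
    # CJK Unified Ideographs: U+4E00 .. U+9FA5
    return 0x4e00 <= ord(character) < 0x9fa6

print([is_chinese_char(c) for c in "我爱ABC"])  # [True, True, False, False, False]
```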
@ -253,7 +253,6 @@ class Analysis:
# print(word3.length, word3.text)
if word3.length == -1:
chunk = Chunk(word1, word2)
# print("Ture")
else:
chunk = Chunk(word1, word2, word3)
chunks.append(chunk)

@ -181,8 +181,8 @@ template <typename T>
class FlagRegisterer {
public:
FlagRegisterer(const string &name, const FlagDescription<T> &desc) {
auto registr = FlagRegister<T>::GetRegister();
registr->SetDescription(name, desc);
auto r = FlagRegister<T>::GetRegister();
r->SetDescription(name, desc);
}
private:

@ -62,7 +62,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove withespace, charactor text
# remove whitespace, character text
text = ''.join(text.split())
transcript_dict[audio_id] = text

@ -63,7 +63,7 @@ def create_manifest(data_dir, manifest_path_prefix):
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove withespace, charactor text
# remove withespace, character text
text = ''.join(text.split())
transcript_dict[audio_id] = text

@ -66,8 +66,8 @@ config_file=./conf/application.yaml
server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
echo "Sevice ip: $server_ip" | tee ./log/test_result.log
echo "Sevice port: $port" | tee -a ./log/test_result.log
echo "Service ip: $server_ip" | tee ./log/test_result.log
echo "Service port: $port" | tee -a ./log/test_result.log
# whether a process is listening on $port
pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@ -190,7 +190,7 @@ echo "**************************************************************************
echo "All tests completed." | tee -a ./log/test_result.log
# sohw all the test results
# show all the test results
echo "***************** Here are all the test results ********************"
cat ./log/test_result.log

@ -76,8 +76,8 @@ config_file=./conf/application.yaml
server_ip=$(cat $config_file | grep "host" | awk -F " " '{print $2}')
port=$(cat $config_file | grep "port" | awk '/port:/ {print $2}')
echo "Sevice ip: $server_ip" | tee $log/test_result.log
echo "Sevice port: $port" | tee -a $log/test_result.log
echo "Service ip: $server_ip" | tee $log/test_result.log
echo "Service port: $port" | tee -a $log/test_result.log
# whether a process is listening on $port
pid=`lsof -i :"$port"|grep -v "PID" | awk '{print $2}'`
@ -307,7 +307,7 @@ echo "**************************************************************************
echo "All tests completed." | tee -a $log/test_result.log
# sohw all the test results
# show all the test results
echo "***************** Here are all the test results ********************"
cat $log/test_result.log

@ -30,7 +30,7 @@ def _test_snapshot():
# use a simplest iterable object as dataloader
dataloader = count()
# hack the training proecss: training does nothing except increse iteration
# hack the training process: training does nothing except increase iteration
updater = StandardUpdater(model, optimizer, dataloader=dataloader)
updater.update_core = lambda x: None

@ -17,13 +17,13 @@ cd liblbfgs-$VER
./configure --prefix=`pwd`
make
# due to the liblbfgs project directory structure, we have to use -i
# but the erros are completely harmless
# but the errors are completely harmless
make -i install
cd ..
(
[ ! -z "${LIBLBFGS}" ] && \
echo >&2 "LIBLBFGS variable is aleady defined. Undefining..." && \
echo >&2 "LIBLBFGS variable is already defined. Undefining..." && \
unset LIBLBFGS
[ -f ./env.sh ] && . ./env.sh

@ -68,7 +68,7 @@ make || exit
cd ..
(
[ ! -z "${SRILM}" ] && \
echo >&2 "SRILM variable is aleady defined. Undefining..." && \
echo >&2 "SRILM variable is already defined. Undefining..." && \
unset SRILM
[ -f ./env.sh ] && . ./env.sh

@ -44,7 +44,7 @@ add_arg('manifest_paths', str,
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
add_arg('output_path', str, None, "filepath of formatted manifest.", required=True)
# yapf: disable
args = parser.parse_args()

@ -32,7 +32,7 @@ def main(args):
# leaving `token`
print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
node += 1
# Fianl node
# Final node
print('0')

@ -21,7 +21,7 @@ cp -r $src_lang $tgt_lang
# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
# s2eps.pl: replaces <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
# G.fst, the disambiguation symbol #0 only appears on the input side
# do eps2disambig.pl and s2eps.pl maybe just for fallowing `fstrmepsilon`.
# eps2disambig.pl and s2eps.pl may be needed just for the following `fstrmepsilon`.
cat $arpa_lm | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \

@ -3,7 +3,7 @@
'''
Merge training configs into a single inference config.
The single inference config is for CLI, which only takes a single config to do inferencing.
The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file.
The training configs include: model config, preprocess config, decode config, vocab file and cmvn file.
Process:
# step 1: prepare dir
@ -11,7 +11,7 @@
cp -r exp conf data release_dir
cd release_dir
# step 2: get "model.yaml" which conatains all configuration info.
# step 2: get "model.yaml" which contains all configuration info.
# if it does not contain a preprocess.yaml file, e.g. ds2:
python generate_infer_yaml.py --cfg_pth conf/deepspeech2_online.yaml --dcd_pth conf/tuning/chunk_decode.yaml --vb_pth data/lang_char/vocab.txt --cmvn_pth data/mean_std.json --save_pth model.yaml --pre_pth null
# if contains preprocess.yaml file. e.g u2:

@ -79,7 +79,7 @@ if ($HELP)
print " -b ... disable Perl buffering.\n";
print " -time ... enable processing time calculation.\n";
print " -penn ... use Penn treebank-like tokenization.\n";
print " -protected FILE ... specify file with patters to be protected in tokenisation.\n";
print " -protected FILE ... specify file with patterns to be protected in tokenisation.\n";
print " -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
exit;
}

@ -37,7 +37,7 @@ fi
# the text should be properly pre-processed, e.g:
# cleaned, normalized and possibly word-segmented
# get rid off irrelavent symbols
# get rid of irrelevant symbols
grep -v '<eps>' $symbol_table \
| grep -v '#0' \
| grep -v '<unk>' | grep -v '<UNK>' \
@ -51,7 +51,7 @@ grep -v '<eps>' $symbol_table \
#
# TL;DR reason:
# Unlike SRILM's -limit-vocab, kenlm's --limit_vocab_file option
# spcifies a *valid* set of vocabulary, whereas *valid but unseen*
# specifies a *valid* set of vocabulary, whereas *valid but unseen*
# words are discarded in final arpa.
# So the trick is,
# we explicitly add kaldi's vocab(one word per line) to training text,

@ -1288,7 +1288,7 @@ def normalize_corpus(corpus,
def char_token(s: Text) -> List[Text]:
"""chinese charactor
"""chinese character
Args:
s (Text): "我爱中国“
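A minimal sketch of the character-level split this docstring describes (the real `char_token` may additionally filter symbols or normalize the text):
```
from typing import List, Text

def char_token_sketch(s: Text) -> List[Text]:
    # split a string into a list of single characters
    return list(s)

print(char_token_sketch("我爱中国"))  # ['我', '爱', '中', '国']
```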
