commit
a20ca46eac
@ -0,0 +1,41 @@
|
|||||||
|
# Prosody Prediction with CSMSC and AISHELL-3
|
||||||
|
|
||||||
|
## Get Started
|
||||||
|
### Data Preprocessing
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 0 --stop-stage 0
|
||||||
|
```
|
||||||
|
### Model Training
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 1 --stop-stage 1
|
||||||
|
```
|
||||||
|
### Testing
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 2 --stop-stage 2
|
||||||
|
```
|
||||||
|
### Prosody Prediction
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 3 --stop-stage 3
|
||||||
|
```
|
||||||
|
## Pretrained Model
|
||||||
|
The pretrained model can be downloaded here:
|
||||||
|
|
||||||
|
[ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/ernie-1.0_aishellcsmsc_ckpt_1.3.0.zip)
|
||||||
|
|
||||||
|
After downloading, unzip it and put the checkpoint into the `exp/${YOUREXP}/checkpoints` folder.
|
||||||
|
|
||||||
|
## Rhythm mapping
|
||||||
|
Four punctuation marks are used as rhythm tokens. The table below shows which rhythm mark of each dataset every token stands for:
|
||||||
|
|rhy_token|csmsc|aishell3|
|
||||||
|
|:---: |:---: |:---: |
|
||||||
|
|%|#1|%|
|
||||||
|
|`|#2||
|
||||||
|
|~|#3||
|
||||||
|
|$|#4|$|
|
||||||
|
|
||||||
|
## Prediction Results
|
||||||
|
| | #1 | #2 | #3 | #4 |
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.90 |0.66 |0.91 |0.90|
|
||||||
|
|Recall |0.92 |0.62 |0.83 |0.85|
|
||||||
|
|F1 |0.91 |0.64 |0.87 |0.87|
|
@ -0,0 +1,44 @@
|
|||||||
|
###########################################################
|
||||||
|
# DATA SETTING #
|
||||||
|
###########################################################
|
||||||
|
dataset_type: Ernie
|
||||||
|
train_path: data/train.txt
|
||||||
|
dev_path: data/dev.txt
|
||||||
|
test_path: data/test.txt
|
||||||
|
batch_size: 64
|
||||||
|
num_workers: 2
|
||||||
|
data_params:
|
||||||
|
pretrained_token: ernie-1.0
|
||||||
|
punc_path: data/rhy_token
|
||||||
|
seq_len: 100
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# MODEL SETTING #
|
||||||
|
###########################################################
|
||||||
|
model_type: ErnieLinear
|
||||||
|
model:
|
||||||
|
pretrained_token: ernie-1.0
|
||||||
|
num_classes: 5
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OPTIMIZER SETTING #
|
||||||
|
###########################################################
|
||||||
|
optimizer_params:
|
||||||
|
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||||
|
|
||||||
|
scheduler_params:
|
||||||
|
learning_rate: 1.0e-5 # learning rate.
|
||||||
|
gamma: 0.9999 # scheduler gamma must be in (0.0, 1.0); values closer to 1.0 work better.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# TRAINING SETTING #
|
||||||
|
###########################################################
|
||||||
|
max_epoch: 20
# NOTE: `num_snapshots` is intentionally not set here; it is defined once under
# OTHER SETTING below. Defining it in both places creates a duplicate mapping
# key, which YAML loaders either reject or resolve by silently keeping only
# the last value.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_snapshots: 10 # max number of snapshots to keep while training
|
||||||
|
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,4 @@
|
|||||||
|
%
|
||||||
|
`
|
||||||
|
~
|
||||||
|
$
|
@ -0,0 +1,26 @@
|
|||||||
|
#!/bin/bash
# Fetch the two label files (when they are not already present in the current
# directory) and run the CSMSC and AISHELL preprocessing scripts on them.
#
# Usage: <this script> <aishell_data> <csmsc_data> <processed_path>
#   aishell_data    label file forwarded as --data to pre_for_sp_aishell.py
#   csmsc_data      label file forwarded as --data to pre_for_sp_csmsc.py
#   processed_path  forwarded as --processed_path to both scripts

# Stop at the first failing command. Without this, a failed download or a
# failed preprocessing step would still reach the final `exit 0` and the
# caller would believe data preparation succeeded.
set -e

if [ $# -lt 3 ]; then
    echo "Usage: $0 <aishell_data> <csmsc_data> <processed_path>" >&2
    exit 1
fi

if [ ! -f 000001-010000.txt ]; then
    wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/000001-010000.txt
fi

if [ ! -f label_train-set.txt ]; then
    wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/rhy_predict/label_train-set.txt
fi

aishell_data=$1
csmsc_data=$2
processed_path=$3

# Expansions are quoted so paths containing spaces stay a single argument.
python3 ./local/pre_for_sp_csmsc.py \
    --data="${csmsc_data}" \
    --processed_path="${processed_path}"

python3 ./local/pre_for_sp_aishell.py \
    --data="${aishell_data}" \
    --processed_path="${processed_path}"

echo "Finish data preparation."
exit 0
|
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
# Run ${BIN_DIR}/punc_restore.py on a single input text with a trained
# checkpoint.
#
# Positional arguments:
#   $1  config_path        passed as --config
#   $2  train_output_path  directory that contains the `checkpoints/` folder
#   $3  ckpt_name          checkpoint file name inside `checkpoints/`
#   $4  text               passed as --text
#
# BIN_DIR must already be set in the environment.

config_path=$1
train_output_path=$2
ckpt_name=$3
text=$4

# Every expansion is quoted: the input sentence (and any path) may contain
# whitespace or glob characters and must reach Python as one argument.
python3 "${BIN_DIR}/punc_restore.py" \
    --config="${config_path}" \
    --checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
    --text="${text}"
|
@ -0,0 +1,13 @@
|
|||||||
|
#!/bin/bash
# Run ${BIN_DIR}/test.py against a trained checkpoint.
#
# Positional arguments:
#   $1  config_path        passed as --config
#   $2  train_output_path  directory that contains the `checkpoints/` folder
#   $3  ckpt_name          checkpoint file name inside `checkpoints/`
#   $4  print_eval         passed as --print_eval
#
# BIN_DIR must already be set in the environment.

config_path=$1
train_output_path=$2
ckpt_name=$3
print_eval=$4

# Quote the expansions so paths with whitespace are not split into several
# arguments.
python3 "${BIN_DIR}/test.py" \
    --config="${config_path}" \
    --checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
    --print_eval="${print_eval}"
|
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
# Launch ${BIN_DIR}/train.py on a single GPU.
#
# Positional arguments:
#   $1  config_path        passed as --config
#   $2  train_output_path  passed as --output-dir
#
# BIN_DIR must already be set in the environment.

config_path=$1
train_output_path=$2

# Quote the expansions so paths with whitespace are not split into several
# arguments.
python3 "${BIN_DIR}/train.py" \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=1
|
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
# Environment setup for the recipe: exports MAIN_ROOT, PATH, locale and Python
# settings, LD_LIBRARY_PATH and BIN_DIR.
export MAIN_ROOT=${PWD}/../../../

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

# Prepend the previous value only when it is non-empty. A plain
# `${LD_LIBRARY_PATH}:/usr/local/lib/` yields a leading ':' when the variable
# is unset, and the dynamic loader treats that empty entry as the current
# working directory.
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/

# Directory that holds the train/test/predict entry points for this model.
MODEL=ernie_linear
export BIN_DIR=${MAIN_ROOT}/paddlespeech/text/exps/${MODEL}
|
@ -0,0 +1,40 @@
|
|||||||
|
#!/bin/bash
# Recipe driver. Stages:
#   0  data preparation   (./local/data.sh)
#   1  model training     (./local/train.sh)
#   2  testing            (./local/test.sh)
#   3  prosody prediction (./local/rhy_predict.sh)
set -e
source path.sh

gpus=0
stage=0
stop_stage=100

aishell_data=label_train-set.txt
csmsc_data=000001-010000.txt
processed_path=data

conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_2600.pdz
text=我们城市的复苏有赖于他强有力的政策。
print_eval=false

# With the following line you can choose the stage range to run from the
# command line, e.g. `./run.sh --stage 0 --stop-stage 0`.
# These options cannot be combined with positional arguments (`$1`, `$2`, ...).
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/data.sh "${aishell_data}" "${csmsc_data}" "${processed_path}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    # (exit with 1 rather than -1: exit statuses are limited to 0..255)
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh "${conf_path}" "${train_output_path}" || exit 1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    # evaluate the checkpoint selected by `ckpt_name`
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" "${print_eval}" || exit 1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # `text` is quoted so a sentence containing spaces stays one argument
    CUDA_VISIBLE_DEVICES=${gpus} ./local/rhy_predict.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" "${text}" || exit 1
fi
|
@ -0,0 +1,13 @@
|
|||||||
|
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
@ -0,0 +1,58 @@
|
|||||||
|
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
import paddle
|
||||||
|
import paddle.nn as nn
|
||||||
|
from paddle.autograd import PyLayer
|
||||||
|
|
||||||
|
|
||||||
|
class GradientReversalFunction(PyLayer):
    """Autograd function implementing a gradient reversal step.

    Based on "Unsupervised Domain Adaptation by Backpropagation"
    (Ganin & Lempitsky, 2015).

    The forward pass returns a copy of its input unchanged. The backward pass
    multiplies the incoming gradient by ``-lambda_`` (reversing it) and then
    clips the result element-wise to the range ``[-0.5, 0.5]``.
    """

    @staticmethod
    def forward(ctx, x, lambda_=1):
        """Remember ``lambda_`` for the backward pass and return a clone of ``x``.

        Args:
            ctx: PyLayer context used to carry ``lambda_`` to ``backward``.
            x: Input tensor.
            lambda_: Scale applied (negated) to the gradient in ``backward``.

        Returns:
            A clone of ``x``.
        """
        ctx.save_for_backward(lambda_)
        identity = x.clone()
        return identity

    @staticmethod
    def backward(ctx, grads):
        """Reverse, scale and clip the upstream gradient.

        Args:
            ctx: PyLayer context holding the ``lambda_`` saved in ``forward``.
            grads: Upstream gradient.

        Returns:
            ``-lambda_ * grads`` clipped to ``[-0.5, 0.5]``.
        """
        (lambda_, ) = ctx.saved_tensor()
        reversed_grads = -lambda_ * grads
        return paddle.clip(reversed_grads, min=-0.5, max=0.5)
|
||||||
|
|
||||||
|
|
||||||
|
class GradientReversalLayer(nn.Layer):
    """Layer wrapper around ``GradientReversalFunction``.

    Based on "Unsupervised Domain Adaptation by Backpropagation"
    (Ganin & Lempitsky, 2015). The layer stores ``lambda_`` and hands it,
    together with the input, to ``GradientReversalFunction.apply`` on every
    forward call.
    """

    def __init__(self, lambda_=1):
        """Store the gradient scale.

        Args:
            lambda_: Value forwarded to ``GradientReversalFunction`` as its
                ``lambda_`` argument.
        """
        super().__init__()
        self.lambda_ = lambda_

    def forward(self, x):
        """Apply ``GradientReversalFunction`` to ``x`` with the stored ``lambda_``."""
        return GradientReversalFunction.apply(x, self.lambda_)
|
@ -0,0 +1,55 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# Modified from Cross-Lingual-Voice-Cloning(https://github.com/deterministic-algorithms-lab/Cross-Lingual-Voice-Cloning)
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from typeguard import check_argument_types
|
||||||
|
|
||||||
|
|
||||||
|
class SpeakerClassifier(nn.Layer):
    """Per-position speaker classifier built from two stacked linear layers.

    Args:
        idim (int): Size of the last dimension of the input features.
        hidden_sc_dim (int): Output size of the first linear layer.
        spk_num (int): Output size of the second linear layer, i.e. the
            number of speaker classes.
    """

    def __init__(
            self,
            idim: int,
            hidden_sc_dim: int,
            spk_num: int, ):
        assert check_argument_types()
        super().__init__()

        # keep the hyperparameters on the instance
        self.idim = idim
        self.hidden_sc_dim = hidden_sc_dim
        self.spk_num = spk_num

        # idim -> hidden_sc_dim -> spk_num, with nothing between the two layers
        hidden_proj = nn.Linear(self.idim, self.hidden_sc_dim)
        speaker_proj = nn.Linear(self.hidden_sc_dim, self.spk_num)
        self.model = nn.Sequential(hidden_proj, speaker_proj)

    def parse_outputs(self, out, text_lengths):
        """Zero every position at or beyond each sequence's length.

        Args:
            out: Classifier output, documented by ``forward`` as
                [batch_size, seq_len, spk_num].
            text_lengths: Per-sequence lengths, [batch_size].

        Returns:
            ``out`` multiplied by the length mask, in the same axis order as
            the input.
        """
        batch_size, max_len = out.shape[0], out.shape[1]
        positions = paddle.arange(max_len).expand([batch_size, max_len])
        mask = positions < text_lengths.unsqueeze(1)
        # Bring the class axis to the front so the [batch, seq] mask lines up
        # with the trailing two axes, multiply, then restore the axis order.
        masked = paddle.transpose(out, perm=[2, 0, 1]) * mask
        return paddle.transpose(masked, perm=[1, 2, 0])

    def forward(self, encoder_outputs, text_lengths):
        """Classify every position and mask out the padding.

        Args:
            encoder_outputs: [batch_size, seq_len, encoder_embedding_size]
            text_lengths: [batch_size]

        Returns:
            Speaker classification outputs, [batch_size, seq_len, spk_num].
            These are the masked outputs of the linear stack; no softmax or
            log-softmax is applied in this class.
        """
        scores = self.model(encoder_outputs)
        return self.parse_outputs(scores, text_lengths)
|
@ -1,3 +1,2 @@
|
|||||||
data
|
data
|
||||||
utils
|
|
||||||
exp
|
exp
|
||||||
|
@ -0,0 +1,36 @@
|
|||||||
|
# aishell test
|
||||||
|
|
||||||
|
7176 utts, duration 36108.9 sec.
|
||||||
|
|
||||||
|
## Attention Rescore
|
||||||
|
|
||||||
|
### u2++ FP32
|
||||||
|
|
||||||
|
#### CER
|
||||||
|
|
||||||
|
```
|
||||||
|
Overall -> 5.75 % N=104765 C=99035 S=5587 D=143 I=294
|
||||||
|
Mandarin -> 5.75 % N=104762 C=99035 S=5584 D=143 I=294
|
||||||
|
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
|
||||||
|
Other -> 100.00 % N=3 C=0 S=3 D=0 I=0
|
||||||
|
```
|
||||||
|
|
||||||
|
#### RTF
|
||||||
|
|
||||||
|
> RTF is measured over both feature extraction and decoding, which is closer to an end-to-end figure.
|
||||||
|
|
||||||
|
* Intel(R) Xeon(R) Gold 6271C CPU @ 2.60GHz, supports `avx512_vnni`
|
||||||
|
|
||||||
|
```
|
||||||
|
I1027 10:52:38.662868 51665 u2_recognizer_main.cc:122] total wav duration is: 36108.9 sec
|
||||||
|
I1027 10:52:38.662858 51665 u2_recognizer_main.cc:121] total cost:11169.1 sec
|
||||||
|
I1027 10:52:38.662876 51665 u2_recognizer_main.cc:123] RTF is: 0.309318
|
||||||
|
```
|
||||||
|
|
||||||
|
* Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, does not support `avx512_vnni`
|
||||||
|
|
||||||
|
```
|
||||||
|
I1026 16:13:26.247121 48038 u2_recognizer_main.cc:123] total wav duration is: 36108.9 sec
|
||||||
|
I1026 16:13:26.247130 48038 u2_recognizer_main.cc:124] total decode cost:13656.7 sec
|
||||||
|
I1026 16:13:26.247138 48038 u2_recognizer_main.cc:125] RTF is: 0.378208
|
||||||
|
```
|
@ -0,0 +1,36 @@
|
|||||||
|
#!/bin/bash
# Decode the AISHELL test list with the quantized u2++ model using `nj`
# parallel jobs, merge the per-job results and score them with
# utils/compute-wer.py.
set -e

data=data
exp=exp
nj=20

source utils/parse_options.sh

mkdir -p ${exp}
ckpt_dir=./data/model
model_dir=${ckpt_dir}/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model/
aishell_wav_scp=aishell_test.scp
text=${data}/test/text

# per-job working directories live under ${data}/split${nj}/<job id>/
split_root=${data}/split${nj}

./local/split_data.sh ${data} ${data}/${aishell_wav_scp} ${aishell_wav_scp} ${nj}

# run.pl substitutes JOB with the job index in every argument
utils/run.pl JOB=1:${nj} ${split_root}/JOB/recognizer.quant.log \
    u2_recognizer_main \
    --use_fbank=true \
    --num_bins=80 \
    --cmvn_file=${exp}/cmvn.ark \
    --model_path=${model_dir}/export \
    --vocab_path=${model_dir}/unit.txt \
    --nnet_decoder_chunk=16 \
    --receptive_field_length=7 \
    --subsampling_rate=4 \
    --wav_rspecifier=scp:${split_root}/JOB/${aishell_wav_scp} \
    --result_wspecifier=ark,t:${split_root}/JOB/recognizer.quant.rsl.ark

# merge the per-job results, then compute the character error rate
cat ${split_root}/*/recognizer.quant.rsl.ark > ${exp}/aishell.recognizer.quant.rsl
utils/compute-wer.py --char=1 --v=1 ${text} ${exp}/aishell.recognizer.quant.rsl > ${exp}/aishell.recognizer.quant.err
echo "recognizer quant test have finished!!!"
echo "please checkout in ${exp}/aishell.recognizer.quant.err"
tail -n 7 ${exp}/aishell.recognizer.quant.err
|
@ -0,0 +1 @@
|
|||||||
|
../../../../utils/
|
@ -0,0 +1 @@
|
|||||||
|
paddlepaddle>=2.4rc
|
@ -0,0 +1,20 @@
|
|||||||
|
# add watermark for text
|
||||||
|
def watermark(content, pattern):
    """Interleave a cyclic watermark into ``content``.

    Each character of ``content`` is preceded by the next character of
    ``pattern`` (cycling through ``pattern`` as often as needed), and one
    further pattern character is appended at the end.

    Args:
        content: Text to mark.
        pattern: Non-empty watermark string that is repeated cyclically.

    Returns:
        The marked string, of length ``2 * len(content) + 1``.

    Raises:
        ZeroDivisionError: If ``pattern`` is empty.
    """
    period = len(pattern)
    pieces = []
    for pos, ch in enumerate(content):
        pieces.append(pattern[pos % period])
        pieces.append(ch)
    # closing mark: the pattern character that would come next in the cycle
    pieces.append(pattern[len(content) % period])
    return ''.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
# remove cyclic watermark in text
|
||||||
|
def iwatermark(content):
    """Strip a cyclic watermark from ``content``.

    The characters at even positions are treated as the candidate watermark
    and those at odd positions as the candidate payload. If the even-position
    sequence repeats itself with some period between 1 and half its length,
    it is taken to be the watermark and the odd-position characters are
    returned. Otherwise the even-position characters are returned.

    Args:
        content: Possibly watermarked text.

    Returns:
        The odd-position characters when a cyclic watermark is detected at
        the even positions, else the even-position characters.
    """
    chars = list(content)
    marks = chars[0::2]
    payload = chars[1::2]
    longest_period = len(marks) // 2
    # A sequence has period `shift` when it equals itself moved by `shift`.
    is_cyclic = any(marks[shift:] == marks[:-shift]
                    for shift in range(1, longest_period + 1))
    return ''.join(payload if is_cyclic else marks)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Demo: mark a sample sentence with the pattern 'hbzs' ...
    marked = watermark('跟世龙对齐 Triton 开发计划', 'hbzs')
    print(marked)
    # ... and strip the watermark from an already marked sentence.
    recovered = iwatermark('h跟b世z龙s对h齐b zTsrhibtzosnh b开z发s计h划b')
    print(recovered)
|
Loading…
Reference in new issue