parent
8e348d66b9
commit
4fe6c0dc99
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,79 @@
|
|||||||
|
# Punctuation Restoration with IWSLT2012-Zh
|
||||||
|
|
||||||
|
## Get Started
|
||||||
|
### Data Preprocessing
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 0 --stop-stage 0
|
||||||
|
```
|
||||||
|
### Model Training
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 1 --stop-stage 1
|
||||||
|
```
|
||||||
|
### Testing
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 2 --stop-stage 2
|
||||||
|
```
|
||||||
|
### Punctuation Restoration
|
||||||
|
```bash
|
||||||
|
./run.sh --stage 3 --stop-stage 3
|
||||||
|
```
|
||||||
|
## Pretrained Model
|
||||||
|
The pretrained model can be downloaded here:
|
||||||
|
|
||||||
|
[ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_iwslt2012_zh_ckpt_0.1.1.zip)
|
||||||
|
|
||||||
|
[ernie-3.0-base.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-3.0-base.tar.gz)
|
||||||
|
|
||||||
|
[ernie-3.0-medium.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-3.0-medium.tar.gz)
|
||||||
|
|
||||||
|
[ernie-3.0-micro.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-3.0-micro.tar.gz)
|
||||||
|
|
||||||
|
[ernie-mini.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-mini.tar.gz)
|
||||||
|
|
||||||
|
[ernie-nano.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-nano.tar.gz)
|
||||||
|
|
||||||
|
[ernie-tiny.tar.gz](https://paddlespeech.bj.bcebos.com/punc_restore/ernie-tiny.tar.gz)
|
||||||
|
|
||||||
|
### Test Result
|
||||||
|
- Ernie 1.0
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.510955 |0.526462 |0.820755 |0.619391|
|
||||||
|
|Recall |0.517433 |0.564179 |0.861386 |0.647666|
|
||||||
|
|F1 |0.514173 |0.544669 |0.840580 |0.633141|
|
||||||
|
- Ernie-tiny
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.733177 |0.721448 |0.754717 |0.736447|
|
||||||
|
|Recall |0.380740 |0.524646 |0.733945 |0.546443|
|
||||||
|
|F1 |0.501204 |0.607506 |0.744186 |0.617632|
|
||||||
|
- Ernie-3.0-base-zh
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.805947 |0.764160 |0.858491 |0.809532|
|
||||||
|
|Recall |0.399070 |0.567978 |0.850467 |0.605838|
|
||||||
|
|F1 |0.533817 |0.651623 |0.854460 |0.679967|
|
||||||
|
- Ernie-3.0-medium-zh
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.730829 |0.699164 |0.707547 |0.712514|
|
||||||
|
|Recall |0.388196 |0.533286 |0.797872 |0.573118|
|
||||||
|
|F1 |0.507058 |0.605062 |0.750000 |0.620707|
|
||||||
|
- Ernie-3.0-mini-zh
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.757433 |0.708449 |0.707547 |0.724477|
|
||||||
|
|Recall |0.355752 |0.506977 |0.735294 |0.532674|
|
||||||
|
|F1 |0.484121 |0.591015 |0.721154 |0.598763|
|
||||||
|
- Ernie-3.0-micro-zh
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.733959 |0.679666 |0.726415 |0.713347|
|
||||||
|
|Recall |0.332742 |0.483487 |0.712963 |0.509731|
|
||||||
|
|F1 |0.457896 |0.565033 |0.719626 |0.580852|
|
||||||
|
- Ernie-3.0-nano-zh
|
||||||
|
| |COMMA | PERIOD | QUESTION | OVERALL|
|
||||||
|
|:-----:|:-----:|:-----:|:-----:|:-----:|
|
||||||
|
|Precision |0.693271 |0.682451 |0.754717 |0.710146|
|
||||||
|
|Recall |0.327784 |0.491968 |0.666667 |0.495473|
|
||||||
|
|F1 |0.445114 |0.571762 |0.707965 |0.574947|
|
@ -0,0 +1,44 @@
|
|||||||
|
###########################################################
|
||||||
|
# DATA SETTING #
|
||||||
|
###########################################################
|
||||||
|
dataset_type: Ernie
|
||||||
|
train_path: data/rhy_predict/train.txt
|
||||||
|
dev_path: data/rhy_predict/dev.txt
|
||||||
|
test_path: data/rhy_predict/test.txt
|
||||||
|
batch_size: 64
|
||||||
|
num_workers: 2
|
||||||
|
data_params:
|
||||||
|
pretrained_token: ernie-1.0
|
||||||
|
punc_path: data/rhy_predict/rhy_token
|
||||||
|
seq_len: 100
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# MODEL SETTING #
|
||||||
|
###########################################################
|
||||||
|
model_type: ErnieLinear
|
||||||
|
model:
|
||||||
|
pretrained_token: ernie-1.0
|
||||||
|
num_classes: 5
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OPTIMIZER SETTING #
|
||||||
|
###########################################################
|
||||||
|
optimizer_params:
|
||||||
|
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||||
|
|
||||||
|
scheduler_params:
|
||||||
|
learning_rate: 1.0e-5 # learning rate.
|
||||||
|
gamma: 0.9999 # scheduler gamma must between(0.0, 1.0) and closer to 1.0 is better.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# TRAINING SETTING #
|
||||||
|
###########################################################
|
||||||
|
max_epoch: 20
# num_snapshots is defined once, in the OTHER SETTING section of this file.
# Defining it here as well made it a duplicate mapping key, which YAML does not
# allow and which loaders resolve inconsistently (reject, or keep the last one).
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_snapshots: 10 # max number of snapshots to keep while training
|
||||||
|
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,4 @@
|
|||||||
|
%
|
||||||
|
`
|
||||||
|
~
|
||||||
|
$
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
# Data preparation: runs the biaobei and aishell preprocessing scripts, both
# writing into the same processed_path.
#
# Usage: data.sh <aishell_data> <biaobei_data> <processed_path>

aishell_data=$1
biaobei_data=$2
processed_path=$3

# Abort on the first failing step; otherwise the unconditional `exit 0` below
# would report success to the caller even though preprocessing failed.
python3 ./local/pre_for_sp_biaobei.py \
    --data="${biaobei_data}" \
    --processed_path="${processed_path}" || exit 1

python3 ./local/pre_for_sp_aishell.py \
    --data="${aishell_data}" \
    --processed_path="${processed_path}" || exit 1

echo "Finish data preparation."
exit 0
|
@ -0,0 +1,29 @@
|
|||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def process_sentence(line):
    """Return ``line`` with a single space inserted between every character.

    Every character is treated alike, so a trailing newline in ``line`` is kept
    and is also preceded by a space. An empty string yields an empty string.
    """
    # str.join builds the result in one pass (no repeated concatenation) and
    # already returns '' for an empty input, so no special case is needed.
    return ' '.join(line)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: read -input_file line by line, pass each line
    # through process_sentence and write the result to -output_file.
    parser = argparse.ArgumentParser(description="Input filename")
    # Both paths are needed; let argparse report a missing one instead of
    # failing later inside open(None).
    parser.add_argument('-input_file', required=True)
    parser.add_argument('-output_file', required=True)
    args = parser.parse_args()

    sentence_cnt = 0
    # Pin the encoding so the result does not depend on the process locale.
    # NOTE(review): assumes the corpus is UTF-8 text - confirm.
    with open(args.input_file, 'r', encoding='utf-8') as f, \
            open(args.output_file, 'w', encoding='utf-8') as write_f:
        for line in f:
            sentence_cnt += 1
            write_f.write(process_sentence(line))

    print('preprocess over')
    print('total sentences number:', sentence_cnt)
|
@ -0,0 +1,12 @@
|
|||||||
|
#!/bin/bash
# Run punc_restore.py on one piece of text using a trained checkpoint.
#
# Usage: rhy_predict.sh <config_path> <train_output_path> <ckpt_name> <text>
# Reads BIN_DIR from the environment to locate punc_restore.py.

config_path=$1
train_output_path=$2
ckpt_name=$3
text=$4

# Every expansion is quoted so that paths or input text containing spaces or
# glob characters reach the Python script as a single argument.
python3 "${BIN_DIR}/punc_restore.py" \
    --config="${config_path}" \
    --checkpoint="${train_output_path}/checkpoints/${ckpt_name}" \
    --text="${text}"
|
@ -0,0 +1,11 @@
|
|||||||
|
#!/bin/bash
# Run test.py against a trained checkpoint.
#
# Usage: test.sh <config_path> <train_output_path> <ckpt_name>
# Reads BIN_DIR from the environment to locate test.py.

config_path=$1
train_output_path=$2
ckpt_name=$3

# Quoted so that paths containing spaces are passed as single arguments.
python3 "${BIN_DIR}/test.py" \
    --config="${config_path}" \
    --checkpoint="${train_output_path}/checkpoints/${ckpt_name}"
|
@ -0,0 +1,9 @@
|
|||||||
|
#!/bin/bash
# Run train.py with the given config, writing results to train_output_path.
#
# Usage: train.sh <config_path> <train_output_path>
# Reads BIN_DIR from the environment to locate train.py.

config_path=$1
train_output_path=$2

# Quoted so that paths containing spaces are passed as single arguments.
python3 "${BIN_DIR}/train.py" \
    --config="${config_path}" \
    --output-dir="${train_output_path}" \
    --ngpu=1
|
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
# Environment setup for this example. MAIN_ROOT is derived from ${PWD}
# (three directory levels up), so the result depends on the directory this
# file is sourced from.
export MAIN_ROOT="${PWD}/../../../"

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH}"

# Add the ':' separator only when LD_LIBRARY_PATH already has content. A plain
# "${LD_LIBRARY_PATH}:/usr/local/lib/" starts with ':' when the variable is
# empty or unset, and the dynamic loader reads that empty entry as the
# current working directory.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+${LD_LIBRARY_PATH}:}/usr/local/lib/"

MODEL=ernie_linear
export BIN_DIR="${MAIN_ROOT}/paddlespeech/text/exps/${MODEL}"
|
@ -0,0 +1,39 @@
|
|||||||
|
#!/bin/bash
# Driver for this example. Stages:
#   0 - data preparation      (local/data.sh)
#   1 - model training        (local/train.sh)
#   2 - testing               (local/test.sh)
#   3 - prediction on ${text} (local/rhy_predict.sh)
# Every stage between ${stage} and ${stop_stage} (inclusive) is run.
set -e
source path.sh

gpus=1
stage=3
stop_stage=3

aishell_data=label_train-set.txt
biaobei_data=000001-010000.txt
processed_path=data/rhy_predict

conf_path=conf/default.yaml
train_output_path=exp/rhy
ckpt_name=snapshot_iter_2600.pdz
text="我们城市的复苏有赖于他强有力的政策。"

# With the following command you can choose the stage range you want to run,
# such as `./run.sh --stage 0 --stop-stage 0`.
# This cannot be mixed with positional arguments (`$1`, `$2`, ...).
source "${MAIN_ROOT}/utils/parse_options.sh" || exit 1

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    # prepare data
    ./local/data.sh "${aishell_data}" "${biaobei_data}" "${processed_path}"
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES="${gpus}" ./local/train.sh "${conf_path}" "${train_output_path}" || exit -1
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    CUDA_VISIBLE_DEVICES="${gpus}" ./local/test.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" || exit -1
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    # ${text} is quoted so that input containing spaces stays one argument.
    CUDA_VISIBLE_DEVICES="${gpus}" ./local/rhy_predict.sh "${conf_path}" "${train_output_path}" "${ckpt_name}" "${text}" || exit -1
fi
|
Loading…
Reference in new issue