feat: add ted_en_zh t1

4 years ago · 9f42ec4bc2
parent 84ea046386
commit 9f42ec4bc2
10 changed files with 567 additions and 0 deletions
--- a/examples/ted_en_zh/t1/.gitignore
+++ b/examples/ted_en_zh/t1/.gitignore
@ -0,0 +1,3 @@
+TED_EnZh
+data
+exp
--- a/examples/ted_en_zh/t1/README.md
+++ b/examples/ted_en_zh/t1/README.md
@ -0,0 +1,15 @@
+
+# TED En-Zh
+
+## Dataset
+
+| Data Subset | Duration in Seconds |
+| --- | --- |
+| data/manifest.train | 0.942 ~ 60   |
+| data/manifest.dev   | 1.151 ~ 39   |
+| data/manifest.test  | 1.1 ~ 42.746 |
+
+## Transformer
+| Model | Params | Config | Char-BLEU |
+| --- | --- | --- | --- |
+| Transformer+ASR MTL | 50.26M | conf/transformer_joint_noam.yaml | 17.38 |
--- a/examples/ted_en_zh/t1/conf/transformer.yaml
+++ b/examples/ted_en_zh/t1/conf/transformer.yaml
@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.train.tiny
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  min_input_len: 5.0  # frame
+  max_input_len: 3000.0 # frame
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.01
+  max_output_input_ratio: 20.0
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'spm'
+  spm_model_prefix: data/bpe_unigram_8000
+  mean_std_filepath: ""
+  # augmentation_config: conf/augmentation.json
+  batch_size: 10
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 83
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: None
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+        normalize_before: true
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        asr_weight: 0.0
+        ctc_weight: 0.0
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: null
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 0.004
+    weight_decay: 1e-06
+  scheduler: warmuplr     # pytorch v1.1.0+ required
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 5
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+
+decoding:
+  batch_size: 5
+  error_rate_type: char-bleu
+  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  word_reward: 0.7
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/t1/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t1/conf/transformer_joint_noam.yaml
@ -0,0 +1,112 @@
+# https://yaml.org/type/float.html
+data:
+  train_manifest: data/manifest.train
+  dev_manifest: data/manifest.dev
+  test_manifest: data/manifest.test
+  min_input_len: 5.0  # frame
+  max_input_len: 3000.0 # frame
+  min_output_len: 0.0 # tokens
+  max_output_len: 400.0 # tokens
+  min_output_input_ratio: 0.01
+  max_output_input_ratio: 20.0
+
+collator:
+  vocab_filepath: data/vocab.txt
+  unit_type: 'spm'
+  spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc
+  mean_std_filepath: ""
+  # augmentation_config: conf/augmentation.json
+  batch_size: 10
+  raw_wav: True  # use raw_wav or kaldi feature
+  spectrum_type: fbank #linear, mfcc, fbank
+  feat_dim: 83
+  delta_delta: False
+  dither: 1.0
+  target_sample_rate: 16000
+  max_freq: None
+  n_fft: None
+  stride_ms: 10.0
+  window_ms: 25.0
+  use_dB_normalization: True
+  target_dB: -20
+  random_seed: 0
+  keep_transcription_text: False
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 2
+
+
+# network architecture
+model:
+    cmvn_file: None
+    cmvn_file_type: "json"
+    # encoder related
+    encoder: transformer
+    encoder_conf:
+        output_size: 256    # dimension of attention
+        attention_heads: 4
+        linear_units: 2048  # the number of units of position-wise feed forward
+        num_blocks: 12      # the number of encoder blocks
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        attention_dropout_rate: 0.0
+        input_layer: conv2d # encoder input type, you can chose conv2d, conv2d6 and conv2d8
+        normalize_before: true
+
+    # decoder related
+    decoder: transformer
+    decoder_conf:
+        attention_heads: 4
+        linear_units: 2048
+        num_blocks: 6
+        dropout_rate: 0.1
+        positional_dropout_rate: 0.1
+        self_attention_dropout_rate: 0.0
+        src_attention_dropout_rate: 0.0
+
+    # hybrid CTC/attention
+    model_conf:
+        asr_weight: 0.5
+        ctc_weight: 0.3
+        ctc_dropoutrate: 0.0
+        ctc_grad_norm_type: null
+        lsm_weight: 0.1     # label smoothing option
+        length_normalized_loss: false
+
+
+training:
+  n_epoch: 20
+  accum_grad: 2
+  global_grad_clip: 5.0
+  optim: adam
+  optim_conf:
+    lr: 2.5
+    weight_decay: 1e-06
+  scheduler: noam    
+  scheduler_conf:
+    warmup_steps: 25000
+    lr_decay: 1.0
+  log_interval: 5
+  checkpoint:
+    kbest_n: 50
+    latest_n: 5
+
+
+decoding:
+  batch_size: 5
+  error_rate_type: char-bleu
+  decoding_method: fullsentence  # 'fullsentence', 'simultaneous'
+  alpha: 2.5
+  beta: 0.3
+  beam_size: 10
+  word_reward: 0.7
+  cutoff_prob: 1.0
+  cutoff_top_n: 0
+  num_proc_bsearch: 8
+  ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+  decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+      # <0: for decoding, use full chunk.
+      # >0: for decoding, use fixed chunk size as set.
+      # 0: used for training, it's prohibited here. 
+  num_decoding_left_chunks: -1  # number of left chunks for decoding. Defaults to -1.
+  simulate_streaming: False  # simulate streaming inference. Defaults to False.
--- a/examples/ted_en_zh/t1/local/convert_torch_to_paddle.py
+++ b/examples/ted_en_zh/t1/local/convert_torch_to_paddle.py
@ -0,0 +1,88 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import paddle
+import torch
+
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def torch2paddle(args):
+    paddle.set_device('cpu')
+    paddle_model_dict = {}
+    torch_model = torch.load(args.torch_ckpt, map_location='cpu')
+    cnt = 0
+    for k, v in torch_model['model'].items():
+        if k.startswith('encoder.embed'):
+            if v.ndim == 2:
+                v = v.transpose(0, 1)
+            paddle_model_dict[k] = v.numpy()
+            cnt += 1
+            logger.info(
+                f"Convert torch weight: {k} to paddlepaddle weight: {k}, shape is {v.shape}"
+            )
+        if k.startswith('encoder.after_norm'):
+            paddle_model_dict[k] = v.numpy()
+            cnt += 1
+            paddle_model_dict[k.replace('en', 'de')] = v.numpy()
+            logger.info(
+                f"Convert torch weight: {k} to paddlepaddle weight: {k.replace('en','de')}, shape is {v.shape}"
+            )
+            paddle_model_dict['st_' + k.replace('en', 'de')] = v.numpy()
+            logger.info(
+                f"Convert torch weight: {k} to paddlepaddle weight: {'st_'+ k.replace('en','de')}, shape is {v.shape}"
+            )
+            cnt += 2
+        if k.startswith('encoder.encoders'):
+            if v.ndim == 2:
+                v = v.transpose(0, 1)
+            paddle_model_dict[k] = v.numpy()
+            logger.info(
+                f"Convert torch weight: {k} to paddlepaddle weight: {k}, shape is {v.shape}"
+            )
+            cnt += 1
+            origin_k = k
+            k_split = k.split('.')
+            if int(k_split[2]) >= 6:
+                k = k.replace(k_split[2], str(int(k_split[2]) - 6))
+                paddle_model_dict[k.replace('en', 'de')] = v.numpy()
+                logger.info(
+                    f"Convert torch weight: {origin_k} to paddlepaddle weight: {k.replace('en','de')}, shape is {v.shape}"
+                )
+                paddle_model_dict['st_' + k.replace('en', 'de')] = v.numpy()
+                logger.info(
+                    f"Convert torch weight: {origin_k} to paddlepaddle weight: {'st_'+ k.replace('en','de')}, shape is {v.shape}"
+                )
+                cnt += 2
+    logger.info(f"Convert {cnt} weights totally from torch to paddlepaddle")
+    paddle.save(paddle_model_dict, args.paddle_ckpt)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--torch_ckpt',
+        type=str,
+        default='/home/snapshot.ep.98',
+        help="Path to torch checkpoint.")
+    parser.add_argument(
+        '--paddle_ckpt',
+        type=str,
+        default='paddle.98.pdparams',
+        help="Path to save paddlepaddle checkpoint.")
+    args = parser.parse_args()
+    torch2paddle(args)
--- a/examples/ted_en_zh/t1/local/data.sh
+++ b/examples/ted_en_zh/t1/local/data.sh
@ -0,0 +1,110 @@
+#!/bin/bash
+
+set -e
+
+stage=-1
+stop_stage=100
+
+# bpemode (unigram or bpe)
+nbpe=8000
+bpemode=unigram
+bpeprefix="data/bpe_${bpemode}_${nbpe}"
+data_dir=./TED_EnZh
+
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+TARGET_DIR=${MAIN_ROOT}/examples/dataset
+mkdir -p ${TARGET_DIR}
+mkdir -p data
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    if [ ! -e ${data_dir} ]; then
+        echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
+        echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
+        echo "The tree of the directory should be:"
+        echo "."
+        echo "|-- En-Zh"
+        echo "|-- test-segment"
+        echo "    |-- tst2010"
+        echo "    |-- ..."
+        echo "|-- train-split"
+        echo "    |-- train-segment"
+        echo "|-- README.md"
+
+        exit 1
+    fi
+
+    # generate manifests
+    python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
+    --manifest_prefix="data/manifest" \
+    --src_dir="${data_dir}"
+
+    echo "Complete raw data pre-process."
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --num_samples=-1 \
+    --spectrum_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --sample_rate=16000 \
+    --stride_ms=10.0 \
+    --window_ms=25.0 \
+    --use_dB_normalization=False \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build vocabulary
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type "spm" \
+    --spm_vocab_size=${nbpe} \
+    --spm_mode ${bpemode} \
+    --spm_model_prefix ${bpeprefix} \
+    --vocab_path="data/vocab.txt" \
+    --text_keys 'text' 'text1' \
+    --manifest_paths="data/manifest.train.raw"
+
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size
+    for set in train dev test; do
+    {
+        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
+        --feat_type "raw" \
+        --cmvn_path "data/mean_std.json" \
+        --unit_type "spm" \
+        --spm_model_prefix ${bpeprefix} \
+        --vocab_path="data/vocab.txt" \
+        --manifest_path="data/manifest.${set}.raw" \
+        --output_path="data/manifest.${set}"
+
+        if [ $? -ne 0 ]; then
+            echo "Formt mnaifest failed. Terminated."
+            exit 1
+        fi
+    }&
+    done
+    wait
+fi
+
+echo "Ted En-Zh Data preparation done."
+exit 0
--- a/examples/ted_en_zh/t1/local/test.sh
+++ b/examples/ted_en_zh/t1/local/test.sh
@ -0,0 +1,31 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: ${0} config_path ckpt_path_prefix"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_prefix=$2
+
+for type in fullsentence; do
+    echo "decoding ${type}"
+    batch_size=32
+    python3 -u ${BIN_DIR}/test.py \
+    --nproc ${ngpu} \
+    --config ${config_path} \
+    --result_file ${ckpt_prefix}.${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decoding.decoding_method ${type} \
+    --opts decoding.batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+
+exit 0
--- a/examples/ted_en_zh/t1/local/train.sh
+++ b/examples/ted_en_zh/t1/local/train.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+
+if [ $# != 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+ckpt_path=$3
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=0
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+fi
+
+python3 -u ${BIN_DIR}/train.py \
+--nproc ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--checkpoint_path ${ckpt_path} \
+--seed ${seed}
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
+
+if [ $? -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
--- a/examples/ted_en_zh/t1/path.sh
+++ b/examples/ted_en_zh/t1/path.sh
@ -0,0 +1,15 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8 
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+
+MODEL=u2_st
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
--- a/examples/ted_en_zh/t1/run.sh
+++ b/examples/ted_en_zh/t1/run.sh
@ -0,0 +1,42 @@
+#!/bin/bash
+set -e
+source path.sh
+
+gpus=0,1,2,3
+stage=1
+stop_stage=100
+conf_path=conf/transformer_joint_noam.yaml
+ckpt_path=paddle.98
+avg_num=5
+data_path=./TED_EnZh # path to unzipped data
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh --data_dir ${data_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path}  ${ckpt} ${ckpt_path}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # test ckpt avg_n
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # export ckpt avg_n
+    CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+fi