From e9798498d686e568d4d3488952f8cd2abec9a05f Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 29 Nov 2021 18:01:39 +0800 Subject: [PATCH 01/53] Update asr inference in paddlespeech.cli. --- paddlespeech/cli/executor.py | 9 +-- paddlespeech/cli/s2t/conf/default_conf.yaml | 0 paddlespeech/cli/s2t/infer.py | 67 +++++++++++++--- paddlespeech/cli/utils.py | 86 ++++++++++++++++++--- 4 files changed, 136 insertions(+), 26 deletions(-) delete mode 100644 paddlespeech/cli/s2t/conf/default_conf.yaml diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index 45472fa4b..2261e011b 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -14,7 +14,6 @@ import os from abc import ABC from abc import abstractmethod -from typing import Optional from typing import Union import paddle @@ -30,16 +29,16 @@ class BaseExecutor(ABC): self.output = None @abstractmethod - def _get_default_cfg_path(self): + def _get_pretrained_path(self, tag: str) -> os.PathLike: """ - Returns a default config file path of current task. + Download and returns pretrained resources path of current task. """ pass @abstractmethod - def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + def _init_from_path(self, *args, **kwargs): """ - Init model from a specific config file. + Init model and other resources from a specific path. """ pass diff --git a/paddlespeech/cli/s2t/conf/default_conf.yaml b/paddlespeech/cli/s2t/conf/default_conf.yaml deleted file mode 100644 index e69de29bb..000000000 diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py index 682279852..6aa29addf 100644 --- a/paddlespeech/cli/s2t/infer.py +++ b/paddlespeech/cli/s2t/infer.py @@ -21,9 +21,21 @@ import paddle from ..executor import BaseExecutor from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import logger +from ..utils import MODEL_HOME __all__ = ['S2TExecutor'] +pretrained_models = { + "wenetspeech_zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'md5': + '54e7a558a6e020c2f5fb224874943f97', + } +} + @cli_register( name='paddlespeech.s2t', description='Speech to text infer command.') @@ -33,11 +45,23 @@ class S2TExecutor(BaseExecutor): self.parser = argparse.ArgumentParser( prog='paddlespeech.s2t', add_help=True) + self.parser.add_argument( + '--model', + type=str, + default='wenetspeech', + help='Choose model type of asr task.') + self.parser.add_argument( + '--lang', type=str, default='zh', help='Choose model language.') self.parser.add_argument( '--config', type=str, default=None, help='Config of s2t task. Use deault config when it is None.') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') self.parser.add_argument( '--input', type=str, help='Audio file to recognize.') self.parser.add_argument( @@ -46,16 +70,39 @@ class S2TExecutor(BaseExecutor): default='cpu', help='Choose device to execute model inference.') - def _get_default_cfg_path(self): + def _get_pretrained_path(self, tag: str) -> os.PathLike: """ - Returns a default config file path of current task. + Download and returns pretrained resources path of current task. 
""" - pass + assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( + tag) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path - def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + def _init_from_path(self, + model_type: str='wenetspeech', + lang: str='zh', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None): """ - Init model from a specific config file. + Init model and other resources from a specific path. """ + if cfg_path is None or ckpt_path is None: + res_path = self._get_pretrained_path( + model_type + '_' + lang) # wenetspeech_zh + cfg_path = os.path.join(res_path, 'conf/conformer.yaml') + ckpt_path = os.path.join( + res_path, 'exp/conformer/checkpoints/wenetspeech.pdparams') + logger.info(res_path) + logger.info(cfg_path) + logger.info(ckpt_path) + + # Init body. pass def preprocess(self, input: Union[str, os.PathLike]): @@ -82,17 +129,15 @@ class S2TExecutor(BaseExecutor): parser_args = self.parser.parse_args(argv) print(parser_args) + model = parser_args.model + lang = parser_args.lang config = parser_args.config + ckpt_path = parser_args.ckpt_path audio_file = parser_args.input device = parser_args.device - if config is not None: - assert os.path.isfile(config), 'Config file is not valid.' - else: - config = self._get_default_cfg_path() - try: - self._init_from_cfg(config) + self._init_from_path(model, lang, config, ckpt_path) self.preprocess(audio_file) self.infer() res = self.postprocess() # Retrieve result of s2t. diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index c83deee89..edf579f71 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import functools +import logging import os from typing import Any from typing import Dict -from typing import List from paddle.framework import load from paddle.utils import download @@ -26,6 +27,7 @@ __all__ = [ 'get_command', 'download_and_decompress', 'load_state_dict_from_url', + 'logger', ] @@ -53,29 +55,27 @@ def get_command(name: str) -> Any: return com['_entry'] -def decompress(file: str): +def decompress(file: str) -> os.PathLike: """ Extracts all files from a compressed file. """ assert os.path.isfile(file), "File: {} not exists.".format(file) - download._decompress(file) + return download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], path: str): +def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: """ Download archieves and decompress to specific path. 
""" if not os.path.isdir(path): os.makedirs(path) - for archive in archives: - assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys())) + return download.get_path_from_url(archive['url'], path, archive['md5']) - download.get_path_from_url(archive['url'], path, archive['md5']) - -def load_state_dict_from_url(url: str, path: str, md5: str=None): +def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike: """ Download and load a state dict from url """ @@ -84,3 +84,69 @@ def load_state_dict_from_url(url: str, path: str, md5: str=None): download.get_path_from_url(url, path, md5) return load(os.path.join(path, os.path.basename(url))) + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_paddlespcceh_home(): + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + 'The environment variable PPSPEECH_HOME {} is not a directory.'. + format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddlespeech') + + +def _get_sub_home(directory): + home = os.path.join(_get_paddlespcceh_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +PPSPEECH_HOME = _get_paddlespcceh_home() +MODEL_HOME = _get_sub_home('models') + + +class Logger(object): + def __init__(self, name: str=None): + name = 'PaddleSpeech' if not name else name + self.logger = logging.getLogger(name) + + log_config = { + 'DEBUG': 10, + 'INFO': 20, + 'TRAIN': 21, + 'EVAL': 22, + 'WARNING': 30, + 'ERROR': 40, + 'CRITICAL': 50 + } + for key, level in log_config.items(): + logging.addLevelName(level, key) + self.__dict__[key.lower()] = functools.partial(self.__call__, level) + + self.format = logging.Formatter( + fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + + def __call__(self, log_level: str, msg: str): + self.logger.log(log_level, msg) + + +logger = Logger() From 383b68d8f47f15c86ea1f9bdce90fe39d8ee3b58 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:03 -0800 Subject: [PATCH 02/53] minor --- dataset/ted_en_zh/ted_en_zh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py index 9a3ba3b31..2d1fc6710 100644 --- a/dataset/ted_en_zh/ted_en_zh.py +++ b/dataset/ted_en_zh/ted_en_zh.py @@ -28,7 +28,7 @@ import soundfile parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - "--src_dir", + "--src-dir", default="", type=str, help="Directory to kaldi splited data. 
(default: %(default)s)") From 6a50211c8042ce15392f37403edb54e59dd9a568 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:37 -0800 Subject: [PATCH 03/53] data process for ted-en-zh st1 --- examples/ted_en_zh/st1/local/data.sh | 214 +++++++++++++++++++-------- examples/ted_en_zh/st1/path.sh | 10 +- 2 files changed, 161 insertions(+), 63 deletions(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index aa958cfde..72d141e7d 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,16 +2,18 @@ set -e -stage=-1 +stage=1 stop_stage=100 dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=8000 -bpemode=unigram +bpemode=bpe bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" data_dir=./TED_EnZh - +target_dir=data/ted_en_zh +dumpdir=data/dump +do_delta=false source ${MAIN_ROOT}/utils/parse_options.sh @@ -38,75 +40,163 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # generate manifests - python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \ - --manifest_prefix="data/manifest" \ - --src_dir="${data_dir}" + # # extract data + # echo "data Extraction" + # python3 local/ted_en_zh.py \ + # --tgt-dir=${target_dir} \ + # --src-dir=${data_dir} - echo "Complete raw data pre-process." fi - +prep_dir=${target_dir}/data_prep if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # compute mean and stddev for normalizer - num_workers=$(nproc) - python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ - --manifest_path="data/manifest.train.raw" \ - --num_samples=-1 \ - --spectrum_type="fbank" \ - --feat_dim=80 \ - --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ - --use_dB_normalization=False \ - --num_workers=${num_workers} \ - --output_path="data/mean_std.json" - - if [ $? -ne 0 ]; then - echo "Compute mean and stddev failed. Terminated." - exit 1 - fi + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + for set in train dev test; do + # for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${prep_dir}/${set}.en-zh + mkdir -p ${prep_dir}/${set}.en-zh + echo "remove duplicate lines..." 
+ cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l} + done + utils/fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${prep_dir}/${set}.en-zh + done fi +feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} +feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --text_keys 'text' 'text1' \ - --manifest_paths="data/manifest.train.raw" - - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi + ### Task dependent. You have to design training and dev sets by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Feature Generation" + fbankdir=data/fbank + # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame + for x in train dev test; do + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} + done + + echo "speed perturbation" + utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh + utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh + utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh + + utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \ + ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh + rm -r ${prep_dir}/temp*.en-zh + utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh + + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} + + for lang in en zh; do + cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang} + for p in "sp0.9-" "sp1.0-" "sp1.1-"; do + awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map + utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang} + done + done + + for x in train_sp dev test; do + local/divide_lang.sh ${prep_dir}/${x}.en-zh zh + done + + for x in train_sp dev; do + # remove utt having more than 3000 frames + # remove utt having more than 400 characters + for lang in zh en; do + remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp + done + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.${lang}.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 + comm -12 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist + + for lang in zh en; do + reduce_data_dir.sh 
${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang} + utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang} + done + rm -rf ${prep_dir}/${x}.en-zh.*.tmp + done + + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + + dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} fi +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt +bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # format manifest with tokenids, vocab size - for set in train dev test; do - { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ - --cmvn_path "data/mean_std.json" \ - --unit_type "spm" \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" - - if [ $? -ne 0 ]; then - echo "Formt mnaifest failed. Terminated." - exit 1 - fi - }& + echo "stage 2: Dictionary and Json Data Preparation" + # echo "make a non-linguistic symbol list for all languages" + # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} + # cat ${nlsyms} + + echo "make a joint source and target dictionary" + echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC + offset=$(wc -l < ${dict}) + grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt + spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0 + spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict} + wc -l ${dict} + + echo "make json files" + data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json + echo "update json (add source references)" + # update json (add source references) + for x in ${train_set} ${train_dev}; do + feat_dir=${dumpdir}/${x}/delta${do_delta} + data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en + update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done - wait fi - echo "Ted En-Zh Data preparation done." 
exit 0 diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index fd537917a..ee4c9779f 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -1,6 +1,6 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH} export LC_ALL=C export PYTHONDONTWRITEBYTECODE=1 @@ -13,3 +13,11 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ MODEL=u2_st export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin + +# Kaldi +export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh +export train_cmd="run.pl" \ No newline at end of file From cdd084512783303b9c606dc4c4e0aa739e6b8c3e Mon Sep 17 00:00:00 2001 From: Junkun Date: Sun, 28 Nov 2021 22:59:37 -0800 Subject: [PATCH 04/53] add translate function --- paddlespeech/s2t/exps/u2_st/model.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 52d3c3b7d..034463fea 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -26,8 +26,10 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.collator import TripletSpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -423,6 +425,30 @@ class U2STTester(U2STTrainer): trans.append(''.join([chr(i) for i in ids])) return trans + def translate(self, audio, audio_len): + """"E2E translation from extracted audio feature""" + cfg = self.config.decoding + text_feature = self.test_loader.collate_fn.text_feature + + hyps = self.model.decode( + audio, + audio_len, + text_feature=text_feature, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch, + ctc_weight=cfg.ctc_weight, + word_reward=cfg.word_reward, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + return hyps + def compute_translation_metrics(self, utts, audio, From 8f3280af8e73c90b148a94800948e4dc7273696a Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:30:18 -0800 Subject: [PATCH 05/53] fix data process --- examples/ted_en_zh/st1/local/data.sh | 53 +++++++++--------- examples/ted_en_zh/st1/local/data_prep.sh | 54 +++++++++++++++++++ examples/ted_en_zh/st1/local/divide_lang.sh | 48 +++++++++++++++++ .../st1/local/espnet_json_to_manifest.py | 27 ++++++++++ .../ted_en_zh/st1/local/remove_punctuation.pl | 25 +++++++++ 5 files changed, 183 insertions(+), 24 deletions(-) create mode 
100755 examples/ted_en_zh/st1/local/data_prep.sh create mode 100755 examples/ted_en_zh/st1/local/divide_lang.sh create mode 100644 examples/ted_en_zh/st1/local/espnet_json_to_manifest.py create mode 100755 examples/ted_en_zh/st1/local/remove_punctuation.pl diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 72d141e7d..8b829a8a1 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=1 +stage=3 stop_stage=100 dict_dir=data/lang_char @@ -14,6 +14,7 @@ data_dir=./TED_EnZh target_dir=data/ted_en_zh dumpdir=data/dump do_delta=false +nj=20 source ${MAIN_ROOT}/utils/parse_options.sh @@ -40,11 +41,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # # extract data - # echo "data Extraction" - # python3 local/ted_en_zh.py \ - # --tgt-dir=${target_dir} \ - # --src-dir=${data_dir} + # extract data + echo "data Extraction" + python3 local/ted_en_zh.py \ + --tgt-dir=${target_dir} \ + --src-dir=${data_dir} fi prep_dir=${target_dir}/data_prep @@ -99,7 +100,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then done fi -feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir} feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -109,7 +110,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fbankdir=data/fbank # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame for x in train dev test; do - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} done @@ -123,7 +124,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -r ${prep_dir}/temp*.en-zh utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} for lang in en zh; do @@ -155,14 +156,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -rf ${prep_dir}/${x}.en-zh.*.tmp done - compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark - dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ - ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/dev.en-zh.zh/feats.scp 
${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt @@ -170,9 +171,6 @@ nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "stage 2: Dictionary and Json Data Preparation" - # echo "make a non-linguistic symbol list for all languages" - # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} - # cat ${nlsyms} echo "make a joint source and target dictionary" echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC @@ -183,20 +181,27 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then wc -l ${dict} echo "make json files" - data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json - data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json echo "update json (add source references)" # update json (add source references) - for x in ${train_set} ${train_dev}; do + for x in train_sp dev; do feat_dir=${dumpdir}/${x}/delta${do_delta} - data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en - update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en + update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \ ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: Format the Json Data" + python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train + python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev + python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test +fi echo "Ted En-Zh Data preparation done." 
exit 0 diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh new file mode 100755 index 000000000..339cee1eb --- /dev/null +++ b/examples/ted_en_zh/st1/local/data_prep.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +export LC_ALL=C + +data_dir=${1} + +for set in train dev test; do +# for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${target_dir}/data_prep/${set}.en-zh + mkdir -p ${target_dir}/data_prep/${set}.en-zh + echo "remove duplicate lines..." + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} + done + fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${target_dir}/data_prep/${set}.en-zh +done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/divide_lang.sh b/examples/ted_en_zh/st1/local/divide_lang.sh new file mode 100755 index 000000000..4e5f85c86 --- /dev/null +++ b/examples/ted_en_zh/st1/local/divide_lang.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# 2021 PaddlePaddle +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. 
./path.sh + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 >" + echo "e.g.: $0 dev" + exit 1 +fi + +set=$1 +lang=$2 +export LC_ALL=en_US.UTF-8 +# Copy stuff intoc its final locations [this has been moved from the format_data script] +# for En +mkdir -p ${set}.en +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.en/${f} + fi +done +sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text + +utils/fix_data_dir.sh ${set}.en +if [ -f ${set}.en/feats.scp ]; then + utils/validate_data_dir.sh ${set}.en || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1; +fi + +# for target language +mkdir -p ${set}.${lang} +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.${lang}/${f} + fi +done +sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text +utils/fix_data_dir.sh ${set}.${lang} +if [ -f ${set}.${lang}/feats.scp ]; then + utils/validate_data_dir.sh ${set}.${lang} || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1; +fi diff --git a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py new file mode 100644 index 000000000..60d254367 --- /dev/null +++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import argparse +import json + + +def main(args): + with open(args.json_file, 'r') as fin: + data_json = json.load(fin) + + with open(args.manifest_file, 'w') as fout: + for key, value in data_json['utts'].items(): + value['utt'] = key + fout.write(json.dumps(value, ensure_ascii=False)) + fout.write("\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--json-file', type=str, default=None, help="espnet data json file.") + parser.add_argument( + '--manifest-file', + type=str, + default='manifest.train', + help='manifest data json line file.') + args = parser.parse_args() + main(args) diff --git a/examples/ted_en_zh/st1/local/remove_punctuation.pl b/examples/ted_en_zh/st1/local/remove_punctuation.pl new file mode 100755 index 000000000..89e19c6f4 --- /dev/null +++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +while() { + $_ = " $_ "; + + # remove punctuation except apostrophe + s//spacemark/g; # for scoring + s/'/apostrophe/g; + s/[[:punct:]]//g; + s/apostrophe/'/g; + s/spacemark//g; # for scoring + + # remove whitespace + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + + print "$_\n"; +} From ea35558ee03527b57cfacccf272f405ca427d0b2 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:31:45 -0800 Subject: [PATCH 06/53] add utils --- utils/addjson.py | 155 +++++++++++ utils/scp2json.py | 48 ++++ utils/tokenizer.perl | 596 +++++++++++++++++++++++++++++++++++++++++++ utils/update_json.sh | 88 +++++++ 4 files changed, 887 insertions(+) create mode 100755 utils/addjson.py create mode 100755 utils/scp2json.py create mode 100644 utils/tokenizer.perl create mode 100755 utils/update_json.sh diff --git a/utils/addjson.py b/utils/addjson.py new file mode 100755 index 000000000..7fabe625e --- /dev/null +++ b/utils/addjson.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2018 Nagoya University (Tomoki Hayashi) +# Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import logging +import sys + +from distutils.util import strtobool + +from espnet.utils.cli_utils import get_commandline_args + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="add multiple json values to an input or output value", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("jsons", type=str, nargs="+", help="json files") + parser.add_argument( + "-i", + "--is-input", + default=True, + type=strtobool, + help="If true, add to input. If false, add to output", + ) + parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + # logging info + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + # make intersection set for utterance keys + js = [] + intersec_ks = [] + for x in args.jsons: + with codecs.open(x, "r", encoding="utf-8") as f: + j = json.load(f) + ks = j["utts"].keys() + logging.info(x + ": has " + str(len(ks)) + " utterances") + if len(intersec_ks) > 0: + intersec_ks = intersec_ks.intersection(set(ks)) + if len(intersec_ks) == 0: + logging.warning("Empty intersection") + break + else: + intersec_ks = set(ks) + js.append(j) + logging.info("new json has " + str(len(intersec_ks)) + " utterances") + + # updated original dict to keep intersection + intersec_org_dic = dict() + for k in intersec_ks: + v = js[0]["utts"][k] + intersec_org_dic[k] = v + + intersec_add_dic = dict() + for k in intersec_ks: + v = js[1]["utts"][k] + for j in js[2:]: + v.update(j["utts"][k]) + intersec_add_dic[k] = v + + new_dic = dict() + for key_id in intersec_org_dic: + orgdic = intersec_org_dic[key_id] + adddic = intersec_add_dic[key_id] + + if "utt2spk" not in orgdic: + orgdic["utt2spk"] = "" + # NOTE: for machine translation + + # add as input + if args.is_input: + # original input + input_list = orgdic["input"] + # additional input + in_add_dic = {} + if "idim" in adddic and "ilen" in adddic: + in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])] + elif "idim" in adddic: + in_add_dic["shape"] = [int(adddic["idim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["idim", "ilen"]: + continue + in_add_dic[key] = value + # add name + in_add_dic["name"] = "input%d" % (len(input_list) + 1) + + input_list.append(in_add_dic) + new_dic[key_id] = { + "input": input_list, + "output": orgdic["output"], + "utt2spk": orgdic["utt2spk"], + } + # add as output + else: + # original output + output_list = orgdic["output"] + # additional output + out_add_dic = {} + # add shape + if "odim" in adddic and "olen" in adddic: + out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + elif "odim" in adddic: + out_add_dic["shape"] = [int(adddic["odim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["odim", "olen"]: + continue + out_add_dic[key] = value + # add name + out_add_dic["name"] = "target%d" % (len(output_list) + 1) + + output_list.append(out_add_dic) + new_dic[key_id] = { + "input": orgdic["input"], + "output": output_list, + "utt2spk": 
orgdic["utt2spk"], + } + if "lang" in orgdic.keys(): + new_dic[key_id]["lang"] = orgdic["lang"] + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + {"utts": new_dic}, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": "), + ) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py new file mode 100755 index 000000000..8e8de3e08 --- /dev/null +++ b/utils/scp2json.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import sys + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="convert scp to json", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--key", "-k", type=str, help="key") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + new_line = {} + sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + line = sys.stdin.readline() + while line: + x = line.rstrip().split() + v = {args.key: " ".join(x[1:])} + new_line[x[0]] = v + line = sys.stdin.readline() + + all_l = {"utts": new_line} + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") + ) + print(jsonstring) diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl new file mode 100644 index 000000000..ae97d6582 --- /dev/null +++ b/utils/tokenizer.perl @@ -0,0 +1,596 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); +### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use FindBin qw($RealBin); +use strict; +use Time::HiRes; + +if (eval {require Thread;1;}) { + #module loaded + Thread->import(); +} + +my $mydir = "$RealBin/../share/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my @protected_patterns = (); +my $protected_patterns_file = ""; +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; +my $PENN = 0; +my $NO_ESCAPING = 0; +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 1, next); + /^-time$/ && ($TIMING = 1, next); + # Option to add list of regexps to be protected + /^-protected/ && ($protected_patterns_file = shift, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); + /^-penn$/ && ($PENN = 1, next); + /^-no-escape/ && ($NO_ESCAPING = 1, next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + print " -penn ... use Penn treebank-like tokenization.\n"; + print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; + print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +# Load protected patterns +if ($protected_patterns_file) +{ + open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; + while() { + chomp; + push @protected_patterns, $_; + } +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while() + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while() + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array containing a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + + if ($PENN) { + return tokenize_penn($text); + } + + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # Find protected patterns + my @protected = (); + foreach my $protected_pattern (@protected_patterns) { + my $t = $text; + while ($t =~ /(?$protected_pattern)(?.*)$/) { + push @protected, $+{PATTERN}; + $t = $+{TAIL}; + } + } + + for (my $i = 0; $i < scalar(@protected); ++$i) { + my $subst = sprintf("THISISPROTECTED%.3d", $i); + $text =~ s,\Q$protected[$i], $subst ,g; + } + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # separate out all "other" special characters + if (($language eq "fi") or ($language eq "sv")) { + # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character: + # USA:n, 20:een, EU:ssa, USA:s, S:t + $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g; + # if a colon is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; + } + elsif ($language eq "tdt") { + # in Tetun, the apostrophe can be used inside words as an apostrophe-like character: + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g; + } + elsif (($language eq "ca")) { + # in Catalan, the middle dot can be used inside words: + # il�lusio + $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g; + # if a middot is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g; + } + else { + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + } + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # seperate out "," except if within numbers (5,300) + #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + + # separate out "," except if within numbers (5,300) + # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E + # first application uses up B so rule can't see B,C + # two-step version here may create extra spaces but these are removed later + # will also space digit,letter or letter,digit forms (redundant with next section) + $text =~ s/([^\p{IsN}])[,]/$1 , /g; + $text =~ s/[,]([^\p{IsN}])/ , $1/g; + + # separate "," after a number if it's the end of a sentence + $text =~ s/([\p{IsN}])[,]$/$1 ,/g; + + # separate , pre and post number + #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + #$text =~ 
s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn `into ' + #$text =~ s/\`/\'/g; + + #turn '' into " + #$text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + elsif (($language eq "so") or ($language eq "tdt")) + { + # Don't split glottals + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if ($i == scalar(@words)-1) { + # split last words independently as they are unlikely to be non-breaking prefixes + $word = $pre." ."; + } + elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + } + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub tokenize_penn +{ + # Improved compatibility with Penn Treebank tokenization. Useful if + # the text is to later be parsed with a PTB-trained parser. + # + # Adapted from Robert MacIntyre's sed script: + # http://www.cis.upenn.edu/~treebank/tokenizer.sed + + my($text) = @_; + chomp($text); + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # attempt to get correct directional quotes + $text =~ s/^``/`` /g; + $text =~ s/^"/`` /g; + $text =~ s/^`([^`])/` $1/g; + $text =~ s/^'/` /g; + $text =~ s/([ ([{<])"/$1 `` /g; + $text =~ s/([ ([{<])``/$1 `` /g; + $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; + $text =~ s/([ ([{<])'/$1 ` /g; + # close quotes handled at end + + $text =~ s=\.\.\.= _ELLIPSIS_ =g; + + # separate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; +$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; + + # Separate out intra-token slashes. PTB tokenization doesn't do this, so + # the tokens should be merged prior to parsing with a PTB-trained parser + # (see syntax-hyphen-splitting.perl). + $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; + + # Assume sentence tokenization has been done first, so split FINAL periods + # only. + $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; + # however, we may as well split ALL question marks and exclamation points, + # since they shouldn't have the abbrev.-marker ambiguity problem + $text =~ s=([?!])= $1 =g; + + # parentheses, brackets, etc. 
+ $text =~ s=([\]\[\(\){}<>])= $1 =g; + $text =~ s/\(/-LRB-/g; + $text =~ s/\)/-RRB-/g; + $text =~ s/\[/-LSB-/g; + $text =~ s/\]/-RSB-/g; + $text =~ s/{/-LCB-/g; + $text =~ s/}/-RCB-/g; + + $text =~ s=--= -- =g; + + # First off, add a space to the beginning and end of each line, to reduce + # necessary number of regexps. + $text =~ s=$= =; + $text =~ s=^= =; + + $text =~ s="= '' =g; + # possessive or close-single-quote + $text =~ s=([^'])' =$1 ' =g; + # as in it's, I'm, we'd + $text =~ s='([sSmMdD]) = '$1 =g; + $text =~ s='ll = 'll =g; + $text =~ s='re = 're =g; + $text =~ s='ve = 've =g; + $text =~ s=n't = n't =g; + $text =~ s='LL = 'LL =g; + $text =~ s='RE = 'RE =g; + $text =~ s='VE = 'VE =g; + $text =~ s=N'T = N'T =g; + + $text =~ s= ([Cc])annot = $1an not =g; + $text =~ s= ([Dd])'ye = $1' ye =g; + $text =~ s= ([Gg])imme = $1im me =g; + $text =~ s= ([Gg])onna = $1on na =g; + $text =~ s= ([Gg])otta = $1ot ta =g; + $text =~ s= ([Ll])emme = $1em me =g; + $text =~ s= ([Mm])ore'n = $1ore 'n =g; + $text =~ s= '([Tt])is = '$1 is =g; + $text =~ s= '([Tt])was = '$1 was =g; + $text =~ s= ([Ww])anna = $1an na =g; + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while () + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} \ No newline at end of file diff --git a/utils/update_json.sh b/utils/update_json.sh new file mode 100755 index 000000000..bf6974755 --- /dev/null +++ b/utils/update_json.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Copyright 2020 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +echo "$0 $*" >&2 # Print the command line for logging +. ./path.sh + +nlsyms="" +oov="" +bpecode="" +verbose=0 + +text="" +multilingual=false + +help_message=$(cat << EOF +Usage: $0 +e.g. $0 data/train data/lang_1char/train_units.txt +Options: + --oov # Default: + --verbose # Default: 0 +EOF +) +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "${help_message}" 1>&2 + exit 1; +fi + +set -euo pipefail + +json=$1 +dir=$2 +dic=$3 +json_dir=$(dirname ${json}) +tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) +trap 'rm -rf ${tmpdir}' EXIT + +if [ -z ${text} ]; then + text=${dir}/text +fi + +# 2. 
Create scp files for outputs +mkdir -p ${tmpdir}/output +if [ -n "${bpecode}" ]; then + if [ ${multilingual} = true ]; then + # remove a space before the language ID + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ + > ${tmpdir}/output/token.scp + else + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece) \ + > ${tmpdir}/output/token.scp + fi +elif [ -n "${nlsyms}" ]; then + text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp +else + text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp +fi +< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp +awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp +# +2 comes from CTC blank and EOS +vocsize=$(tail -n 1 ${dic} | awk '{print $2}') +odim=$(echo "$vocsize + 2" | bc) +awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp + +cat ${text} > ${tmpdir}/output/text.scp + + +# 4. Create JSON files from each scp files +rm -f ${tmpdir}/*/*.json +for x in "${tmpdir}"/output/*.scp; do + k=$(basename ${x} .scp) + < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json +done + +# add to json +addjson.py --verbose ${verbose} -i false \ + ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json +mkdir -p ${json_dir}/.backup +echo "json updated. original json is kept in ${json_dir}/.backup." +cp ${json} ${json_dir}/.backup/"$(basename ${json})" +cp ${tmpdir}/data.json ${json} + +rm -fr ${tmpdir} From 48207c14107a0de7d0c54d008220b1be832ba615 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:34:21 -0800 Subject: [PATCH 07/53] process scripts and configs --- examples/ted_en_zh/st1/conf/fbank.conf | 2 + examples/ted_en_zh/st1/conf/pitch.conf | 1 + examples/ted_en_zh/st1/local/ted_en_zh.py | 104 ++++++++++++++++++++++ examples/ted_en_zh/st1/steps | 1 + examples/ted_en_zh/st1/utils | 1 + 5 files changed, 109 insertions(+) create mode 100644 examples/ted_en_zh/st1/conf/fbank.conf create mode 100644 examples/ted_en_zh/st1/conf/pitch.conf create mode 100644 examples/ted_en_zh/st1/local/ted_en_zh.py create mode 120000 examples/ted_en_zh/st1/steps create mode 120000 examples/ted_en_zh/st1/utils diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf new file mode 100644 index 000000000..82ac7bd0d --- /dev/null +++ b/examples/ted_en_zh/st1/conf/fbank.conf @@ -0,0 +1,2 @@ +--sample-frequency=16000 +--num-mel-bins=80 diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf new file mode 100644 index 000000000..e959a19d5 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py new file mode 100644 index 000000000..f30573b7e --- /dev/null +++ b/examples/ted_en_zh/st1/local/ted_en_zh.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import codecs +import os + + +# org_split = 'train-split/train-segment' +# text_file = 'En-Zh/train.en-zh' +# data_split = 'train' +def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list, + data_split_list): + + for org_split, text_file, data_split in zip(wav_dir_list, text_file_list, + data_split_list): + local_data_split_dir = os.path.join(tgt_dir, data_split) + + os.makedirs(local_data_split_dir, exist_ok=True) + utts = [] + utt2spk = {} + with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \ + open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf: + for files in os.listdir(os.path.join(src_dir, org_split)): + files = files.strip() + file_path = os.path.join(src_dir, org_split, files) + size = os.path.getsize(file_path) + if size <= 30000: + continue + utt = files.split('.')[0] + audio_name = utt.split('_')[0] + #format the name of utterance + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + utts.append(utt) + spk = utt.split('_')[0] + utt2spk[utt] = spk + assert len(spk) == 16, "%r" % spk + print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf) + for utt in sorted(utts): + print(utt, utt2spk[utt], file=utt2spk_wf) + + with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \ + open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \ + open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \ + codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8', + errors='ignore') as rf: + count = 0 + for line in rf: + line = line.strip() + line_spl = line.split('\t') + assert len(line_spl) == 3, "%r" % line + wav, en, zh = line_spl + assert wav.endswith('wav'), "%r" % wav[-3:] + utt = wav.split('.')[0] + audio_name = utt.split('_')[0] + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + print(utt, file=yaml_wf) + print(en.lower(), file=en_wf) + print(zh, file=zh_wf) + count += 1 + print('%s set lines count: %d' % (data_split, count)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--src-dir", + default="", + type=str, + help="Directory to kaldi splited data. (default: %(default)s)") + parser.add_argument( + "--tgt-dir", + default="local/ted_en_zh", + type=str, + help="Directory to save processed data. 
(default: %(default)s)") + args = parser.parse_args() + + wav_dir_list = [ + 'train-split/train-segment', 'test-segment/tst2014', + 'test-segment/tst2015' + ] + text_file_list = [ + 'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh' + ] + data_split_list = ['train', 'dev', 'test'] + data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list, + data_split_list) diff --git a/examples/ted_en_zh/st1/steps b/examples/ted_en_zh/st1/steps new file mode 120000 index 000000000..91f2d234e --- /dev/null +++ b/examples/ted_en_zh/st1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/examples/ted_en_zh/st1/utils b/examples/ted_en_zh/st1/utils new file mode 120000 index 000000000..f49247da8 --- /dev/null +++ b/examples/ted_en_zh/st1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file From e867f3bb416a0c7b8349995ad2ff3f2c97fc6b4e Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 14:02:05 -0800 Subject: [PATCH 08/53] minor --- examples/ted_en_zh/st1/local/data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 8b829a8a1..c61c9a9fc 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=3 +stage=-1 stop_stage=100 dict_dir=data/lang_char From d2fab3238b7082ee5a5df33d6725514cf4cceb05 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:57:36 -0800 Subject: [PATCH 09/53] fix bugs --- paddlespeech/s2t/frontend/utility.py | 8 ++++---- paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/s2t/utils/checkpoint.py | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 703f2127d..d423a6044 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -102,10 +102,10 @@ def read_manifest( manifest = [] with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: - feat_len = json_data["feat_shape"][ - 0] if 'feat_shape' in json_data else 1.0 - token_len = json_data["token_shape"][ - 0] if 'token_shape' in json_data else 1.0 + feat_len = json_data["input"][0]["shape"][ + 0] if 'shape' in json_data["input"][0] else 1.0 + token_len = json_data["output"][0]["shape"][ + 0] if 'shape' in json_data["output"][0] else 1.0 conditions = [ feat_len >= min_input_len, feat_len <= max_input_len, diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 35b57524b..0d5a16ce1 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/utils/checkpoint.py b/paddlespeech/s2t/utils/checkpoint.py index 5105f95ef..4c493715a 100644 --- a/paddlespeech/s2t/utils/checkpoint.py +++ b/paddlespeech/s2t/utils/checkpoint.py @@ -94,6 +94,9 @@ class Checkpoint(): """ configs = {} + if len(checkpoint_path) == 0 or checkpoint_path == "None": + checkpoint_path = None + if checkpoint_path is not None: pass elif checkpoint_dir is 
not None and record_file is not None: From 3c8e87344a4ce38617adabd44f3496157e9e80e8 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:59:50 -0800 Subject: [PATCH 10/53] update run scripts --- .../st1/conf/transformer_mtl_noam.yaml | 4 +- examples/ted_en_zh/st1/local/data.sh | 2 +- .../ted_en_zh/st1/local/train_finetune.sh | 39 ------------------- examples/ted_en_zh/st1/run.sh | 17 ++++---- 4 files changed, 11 insertions(+), 51 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/train_finetune.sh diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index b4fb51075..3175aad9f 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/lang_char/vocab.txt + vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt unit_type: 'spm' - spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc + spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index c61c9a9fc..f9c876b16 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -166,7 +166,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi -dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh deleted file mode 100755 index e54c7fff4..000000000 --- a/examples/ted_en_zh/st1/local/train_finetune.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - -config_path=$1 -ckpt_name=$2 -ckpt_path=$3 - -mkdir -p exp - -# seed may break model convergence -seed=0 -if [ ${seed} != 0 ]; then - export FLAGS_cudnn_deterministic=True -fi - -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ ---config ${config_path} \ ---output exp/${ckpt_name} \ ---checkpoint_path ${ckpt_path} \ ---seed ${seed} - -if [ ${seed} != 0 ]; then - unset FLAGS_cudnn_deterministic -fi - -if [ $? -ne 0 ]; then - echo "Failed in training!" 
- exit 1 -fi - -exit 0 \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f8adf4f65..a1c99af30 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -6,7 +6,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt_path=paddle.98 +ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -22,21 +22,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # download pretrained - bash ./local/download_pretrain.sh || exit -1 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path} ${ckpt} ${ckpt_path} + if [ -n "${ckpt_path}" ]; then + echo "Finetune from Pretrained Model" ${ckpt_path} + ./local/download_pretrain.sh || exit -1 + fi + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # avg n best model avg.sh best exp/${ckpt}/checkpoints ${avg_num} fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi \ No newline at end of file From 0cc81a52cdae3783972c4fa25d8de33784fb7f97 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:00:09 -0800 Subject: [PATCH 11/53] update format --- utils/addjson.py | 27 ++++++++++++--------------- utils/scp2json.py | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/utils/addjson.py b/utils/addjson.py index 7fabe625e..013d14727 100755 --- a/utils/addjson.py +++ b/utils/addjson.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2018 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - from __future__ import print_function from __future__ import unicode_literals @@ -12,7 +10,6 @@ import codecs import json import logging import sys - from distutils.util import strtobool from espnet.utils.cli_utils import get_commandline_args @@ -23,17 +20,16 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="add multiple json values to an input or output value", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("jsons", type=str, nargs="+", help="json files") parser.add_argument( "-i", "--is-input", default=True, type=strtobool, - help="If true, add to input. If false, add to output", - ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + help="If true, add to input. 
If false, add to output", ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") return parser @@ -121,7 +117,8 @@ if __name__ == "__main__": out_add_dic = {} # add shape if "odim" in adddic and "olen" in adddic: - out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + out_add_dic[ + "shape"] = [int(adddic["olen"]), int(adddic["odim"])] elif "odim" in adddic: out_add_dic["shape"] = [int(adddic["odim"])] # add all other key value @@ -143,13 +140,13 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - {"utts": new_dic}, + { + "utts": new_dic + }, indent=4, ensure_ascii=False, sort_keys=True, - separators=(",", ": "), - ) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + separators=(",", ": "), ) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py index 8e8de3e08..e2a757665 100755 --- a/utils/scp2json.py +++ b/utils/scp2json.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) from __future__ import print_function @@ -17,8 +16,7 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="convert scp to json", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("--key", "-k", type=str, help="key") return parser @@ -28,10 +26,10 @@ if __name__ == "__main__": args = parser.parse_args() new_line = {} - sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + sys.stdin = codecs.getreader("utf-8")(sys.stdin + if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) line = sys.stdin.readline() while line: x = line.rstrip().split() @@ -43,6 +41,9 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") - ) + all_l, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": ")) print(jsonstring) From 351e4e8e87e1b5b678c4aded167cb735327da4ee Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:01:16 -0800 Subject: [PATCH 12/53] training script --- examples/ted_en_zh/st1/local/train.sh | 39 +++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 examples/ted_en_zh/st1/local/train.sh diff --git a/examples/ted_en_zh/st1/local/train.sh b/examples/ted_en_zh/st1/local/train.sh new file mode 100755 index 000000000..a8e4acaa0 --- /dev/null +++ b/examples/ted_en_zh/st1/local/train.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." 
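+# NOTE: ckpt_path ($3) may be an empty string when training from scratch;
+# run.sh passes it quoted ("${ckpt_path}") so the 3-argument check above still passes.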
+ +config_path=$1 +ckpt_name=$2 +ckpt_path=$3 + +mkdir -p exp + +# seed may break model convergence +seed=0 +if [ ${seed} != 0 ]; then + export FLAGS_cudnn_deterministic=True +fi + +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--checkpoint_path "${ckpt_path}" \ +--seed ${seed} + +if [ ${seed} != 0 ]; then + unset FLAGS_cudnn_deterministic +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 \ No newline at end of file From 79060e20e3d5f6285f49b503f95b8db9ddce9294 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:34:43 +0800 Subject: [PATCH 13/53] Update pack_model.sh --- utils/pack_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/pack_model.sh b/utils/pack_model.sh index 8acd59a64..d7df01eb7 100755 --- a/utils/pack_model.sh +++ b/utils/pack_model.sh @@ -57,7 +57,7 @@ else echo "missing ${dec_conf}" exit 1 fi -# NOTE(kan-bayashi): preprocess conf is optional +# preprocess conf is optional if [ -n "${preprocess_conf}" ]; then tar rfh ${outfile}.tar ${preprocess_conf} echo -n " - preprocess config file: \`" From 507c3b52eab46beb411314e90e2928042abd1065 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:16 +0800 Subject: [PATCH 14/53] Update default.yaml --- examples/csmsc/voc3/conf/default.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index cc27220fc..5dda835ae 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). ########################################################### # FEATURE EXTRACTION SETTING # @@ -136,4 +135,4 @@ eval_interval_steps: 1000 # Interval steps to evaluate the network # OTHER SETTING # ########################################################### num_snapshots: 10 # max number of snapshots to keep while training -seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file +seed: 42 # random seed for paddle, random, and np.random From 2de7bc14b085f9b835a4acdc350475405a310ecf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:29 +0800 Subject: [PATCH 15/53] Update finetune.yaml --- examples/csmsc/voc3/conf/finetune.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index 80ab6bed7..302274019 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). 
########################################################### # FEATURE EXTRACTION SETTING # From c94ebdc52cdcf52b9e400fe2090efc953f895b4e Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Tue, 30 Nov 2021 14:22:32 +0800 Subject: [PATCH 16/53] Add python api for executor. --- paddlespeech/cli/executor.py | 15 +++++++++++++++ paddlespeech/cli/s2t/infer.py | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index 2261e011b..2314bd6d3 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -14,6 +14,7 @@ import os from abc import ABC from abc import abstractmethod +from typing import List from typing import Union import paddle @@ -64,3 +65,17 @@ class BaseExecutor(ABC): Output postprocess and return human-readable results such as texts and audio files. """ pass + + @abstractmethod + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + pass + + @abstractmethod + def __call__(self, *arg, **kwargs): + """ + Python API to call an executor. + """ + pass diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py index 6aa29addf..9509e311c 100644 --- a/paddlespeech/cli/s2t/infer.py +++ b/paddlespeech/cli/s2t/infer.py @@ -126,6 +126,9 @@ class S2TExecutor(BaseExecutor): pass def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ parser_args = self.parser.parse_args(argv) print(parser_args) @@ -137,12 +140,20 @@ class S2TExecutor(BaseExecutor): device = parser_args.device try: - self._init_from_path(model, lang, config, ckpt_path) - self.preprocess(audio_file) - self.infer() - res = self.postprocess() # Retrieve result of s2t. + res = self(model, lang, config, ckpt_path, audio_file, device) print(res) return True except Exception as e: print(e) return False + + def __call__(self, model, lang, config, ckpt_path, audio_file, device): + """ + Python API to call an executor. + """ + self._init_from_path(model, lang, config, ckpt_path) + self.preprocess(audio_file) + self.infer() + res = self.postprocess() # Retrieve result of s2t. 
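+        # Example call through this Python API (illustrative values):
+        #   text = S2TExecutor()('wenetspeech', 'zh', None, None, 'input_16k.wav', 'cpu')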
+ + return res From f225b1d88ecdf92b758e999e28ff4e6d433d95f6 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:39:07 -0800 Subject: [PATCH 17/53] minor updates --- examples/ted_en_zh/st1/local/data_prep.sh | 54 ----------------------- examples/ted_en_zh/st1/path.sh | 3 +- examples/ted_en_zh/st1/run.sh | 5 ++- 3 files changed, 4 insertions(+), 58 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/data_prep.sh diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh deleted file mode 100755 index 339cee1eb..000000000 --- a/examples/ted_en_zh/st1/local/data_prep.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Kyoto University (Hirofumi Inaguma) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -export LC_ALL=C - -data_dir=${1} - -for set in train dev test; do -# for set in train; do - dst=${target_dir}/${set} - for lang in en zh; do - - if [ ${lang} = 'en' ]; then - echo "remove punctuation $lang" - # remove punctuation - local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw - else - cp ${dst}/${lang}.org ${dst}/${lang}.raw - fi - - paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} - - - done - # error check - n=$(cat ${dst}/.yaml | wc -l) - n_en=$(cat ${dst}/en.raw | wc -l) - n_tgt=$(cat ${dst}/zh.raw | wc -l) - [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; - [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; - - echo "done text processing" - cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp - cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk - - cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt - rm -rf ${target_dir}/data_prep/${set}.en-zh - mkdir -p ${target_dir}/data_prep/${set}.en-zh - echo "remove duplicate lines..." - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ - | sed 's/^[ \t]*//' > ${dst}/duplicate_lines - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ - | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist - reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh - echo "done wav processing" - for l in en zh; do - cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} - done - fix_data_dir.sh --utt_extra_files \ - "text.en text.zh" \ - ${target_dir}/data_prep/${set}.en-zh -done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index ee4c9779f..867cdb48a 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -19,5 +19,4 @@ export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" -[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh -export train_cmd="run.pl" \ No newline at end of file +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . 
$KALDI_ROOT/tools/config/common_path.sh \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index a1c99af30..f6362a8b3 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -1,12 +1,13 @@ #!/bin/bash set -e -source path.sh +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) +ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; From aea1e92a3df7bad912f70ad84d953f02a43b8471 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:50:34 -0800 Subject: [PATCH 18/53] update cmd.sh --- examples/ted_en_zh/st1/cmd.sh | 89 +++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/ted_en_zh/st1/cmd.sh diff --git a/examples/ted_en_zh/st1/cmd.sh b/examples/ted_en_zh/st1/cmd.sh new file mode 100644 index 000000000..7b70ef5e0 --- /dev/null +++ b/examples/ted_en_zh/st1/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: .pl [options] JOB=1: +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time