Merge branch 'PaddlePaddle:develop' into diffusion

pull/2902/head^2
艾梦 3 years ago committed by GitHub
commit 50837f5a5b

.github/stale.yml

@@ -6,7 +6,8 @@ daysUntilClose: 30
exemptLabels:
- Roadmap
- Bug
-- New Feature
+- feature request
+- Tips
# Label to use when marking an issue as stale
staleLabel: Stale
# Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
unmarkComment: false
# Comment to post when closing a stale issue. Set to `false` to disable
closeComment: >
-This issue is closed. Please re-open if needed.
+This issue is closed. Please re-open if needed.

@@ -40,14 +40,9 @@ COMMITID = 'none'
base = [
"kaldiio",
"librosa==0.8.1",
"scipy>=1.0.0",
"soundfile~=0.10",
"colorlog",
"pathos==0.2.8",
"pathos",
"pybind11",
"parameterized",
"tqdm",
"scikit-learn"
]
requirements = {
@@ -273,7 +268,7 @@ def main():
},
# Package info
-packages=find_packages(include=('paddleaudio*')),
+packages=find_packages(include=['paddleaudio*']),
package_data=lib_package_data,
ext_modules=setup_helpers.get_ext_modules(),
zip_safe=True,
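This `include=` fix is more than cosmetic: `('paddleaudio*')` is a parenthesized string, not a one-element tuple, and setuptools treats `include` as an iterable of glob patterns, so a bare string would be consumed character by character. A minimal sketch of the Python semantics behind the change:

```python
# A parenthesized string is still a string; only a comma makes a tuple.
patterns_str = ('paddleaudio*')     # type: str -> iterating yields characters
patterns_tuple = ('paddleaudio*',)  # one-element tuple: a single glob pattern
patterns_list = ['paddleaudio*']    # list form, as used in the fix above

print(list(patterns_str)[:3])   # ['p', 'a', 'd'] -- not what was intended
print(list(patterns_tuple))     # ['paddleaudio*']
```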

@@ -1,8 +1,6 @@
aiofiles
faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
pydantic
python-multipart
scikit_learn
starlette
uvicorn

@@ -1,11 +1,9 @@
braceexpand
editdistance
fastapi
g2p_en
g2pM
h5py
inflect
jieba
jsonlines
kaldiio
keyboard
@@ -24,30 +22,23 @@ paddlespeech_ctcdecoders
paddlespeech_feat
pandas
pattern_singleton
Pillow>=9.0.0
ppdiffusers>=0.9.0
-praatio==5.0.0
+praatio>=5.0.0
prettytable
pypinyin-dict
pypinyin<=0.44.0
python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
recommonmark>=0.5.0
-resampy==0.2.2
+resampy
sacrebleu
scipy
sentencepiece~=0.1.96
soundfile~=0.10
sphinx
sphinx-autobuild
sphinx-markdown-tables
sphinx_rtd_theme
textgrid
timer
tqdm
typeguard
uvicorn
visualdl
webrtcvad
websockets
yacs~=0.1.8
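The pin relaxations in this file (praatio, pyworld, resampy) trade reproducibility for installability: `>=` admits future major versions, which can break APIs (praatio 6.x, for example, changed its TextGrid interface). A quick sketch of the specifier semantics using the `packaging` library:

```python
from packaging.specifiers import SpecifierSet

print('5.0.0' in SpecifierSet('==5.0.0'))  # True: exact pin
print('6.1.0' in SpecifierSet('==5.0.0'))  # False
print('6.1.0' in SpecifierSet('>=5.0.0'))  # True: future majors now satisfy the pin
```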

@@ -147,14 +147,14 @@ optional arguments:
The pretrained model can be downloaded here:
-- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true)
+- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true)
The VITS checkpoint contains the files listed below.
```text
-vits_csmsc_ckpt_1.1.0
-├── default.yaml # default config used to train vits
-├── phone_id_map.txt # phone vocabulary file when training vits
-└── snapshot_iter_333000.pdz # model parameters and optimizer states
+vits_csmsc_ckpt_1.4.0
+├── default.yaml # default config used to train vits
+├── phone_id_map.txt # phone vocabulary file when training vits
+└── snapshot_iter_150000.pdz # model parameters and optimizer states
```
ps: This ckpt is not good enough; a better one is still in training.
@@ -168,9 +168,9 @@ add_blank=true
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
---config=vits_csmsc_ckpt_1.1.0/default.yaml \
---ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \
---phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \
+--config=vits_csmsc_ckpt_1.4.0/default.yaml \
+--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
+--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--add-blank=${add_blank}
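For readers wondering about `--add-blank`: VITS-style models commonly interleave a blank token between input phone IDs, which tends to improve alignment and audio quality. A conceptual sketch of that operation (PaddleSpeech's actual helper and blank-token value may differ):

```python
def intersperse(seq, item=0):
    """Insert `item` between elements and at both ends: [3, 7] -> [0, 3, 0, 7, 0]."""
    out = [item] * (2 * len(seq) + 1)
    out[1::2] = seq
    return out

print(intersperse([3, 7, 9]))  # [0, 3, 0, 7, 0, 9, 0]
```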

@@ -121,7 +121,7 @@ if __name__ == "__main__":
optimizer.clear_grad()
# Calculate loss
-avg_loss += loss.numpy()[0]
+avg_loss += float(loss)
# Calculate metrics
preds = paddle.argmax(logits, axis=1)
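This pattern shows up because newer Paddle releases return 0-d (shape `()`) values for scalar tensors, so indexing the result of `.numpy()` with `[0]` raises `IndexError`; `float(tensor)` handles both the old 1-element and the new 0-d behavior. The failure mode, sketched with plain NumPy:

```python
import numpy as np

arr0d = np.asarray(np.float32(1.5))  # 0-d array, shape ()
try:
    arr0d[0]
except IndexError as e:
    print(e)          # too many indices for a 0-d array
print(float(arr0d))   # 1.5 -- float() works for 0-d and 1-element alike
```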

@@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor):
if self.task == 'punc':
# punc list
self._punc_list = []
-with open(self.vocab_file, 'r') as f:
+with open(self.vocab_file, 'r', encoding='utf-8') as f:
for line in f:
self._punc_list.append(line.strip())
@@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor):
if self.task == 'punc':
# punc list
self._punc_list = []
-with open(self.vocab_file, 'r') as f:
+with open(self.vocab_file, 'r', encoding='utf-8') as f:
for line in f:
self._punc_list.append(line.strip())
# model
-with open(self.cfg_path) as f:
+with open(self.cfg_path, 'r', encoding='utf-8') as f:
config = CfgNode(yaml.safe_load(f))
self.model = ErnieLinear(**config["model"])

@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
with open(self.voc_config) as f:
self.voc_config = CfgNode(yaml.safe_load(f))
with open(self.phones_dict, "r") as f:
with open(self.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if self.tones_dict:
-with open(self.tones_dict, "r") as f:
+with open(self.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if self.speaker_dict:
-with open(self.speaker_dict, 'rt') as f:
+with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)

@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
Init model and other resources from a specific path.
"""
logger.debug("start to init the model")
-# default max_len: unit:second
-self.max_len = 50
if hasattr(self, 'model'):
logger.debug('Model had been initialized.')
return
@@ -339,12 +338,6 @@
try:
audio, audio_sample_rate = soundfile.read(
audio_file, dtype="int16", always_2d=True)
-audio_duration = audio.shape[0] / audio_sample_rate
-if audio_duration > self.max_len:
-    logger.error(
-        f"Please input audio file less then {self.max_len} seconds.\n"
-    )
-    return False
except Exception as e:
logger.exception(e)
logger.error(
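With this removal, develop no longer rejects inputs longer than 50 seconds; the executor reads whatever soundfile returns. A caller who still wants the old guard can keep the duration computation from the deleted lines locally (sketch with a placeholder path, not repo code):

```python
import soundfile

audio, sr = soundfile.read('input.wav', dtype='int16', always_2d=True)
duration_s = audio.shape[0] / sr  # frames divided by sample rate
if duration_s > 50:
    print(f'audio is {duration_s:.1f}s; long inputs are allowed now, just slower')
```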

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
from paddlespeech.s2t.models.whisper.whipser import decode

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
import os

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
import zlib

@@ -1,5 +1,5 @@
# MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
import os

@@ -437,7 +437,7 @@ if __name__ == '__main__':
vocab_phones = {}
-with open(args.phones_dict, 'rt') as f:
+with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
vocab_phones[phn] = int(id)

@@ -109,7 +109,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -67,7 +67,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker fastspeech2!")
collate_fn = fastspeech2_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
# construct dataset for evaluation
sentences = []
-with open(args.text, 'rt') as f:
+with open(args.text, 'rt', encoding='utf-8') as f:
for line in f:
items = line.strip().split()
utt_id = items[0]
sentence = "".join(items[1:])
sentences.append((utt_id, sentence))
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
with open(args.tones_dict, "r") as f:
with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)

@@ -70,7 +70,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker speedyspeech!")
collate_fn = speedyspeech_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
collate_fn=collate_fn,
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)
with open(args.tones_dict, "r") as f:
with open(args.tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
print("tone_size:", tone_size)

@@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int):
def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
# construct dataset for evaluation
sentences = []
-with open(text_file, 'rt') as f:
+with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
items = re.split(r"\s+", line.strip(), 1)
@@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
tones_dict: Optional[os.PathLike]=None,
speaker_dict: Optional[os.PathLike]=None,
return_am: bool=False):
with open(phones_dict, "r") as f:
with open(phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
tone_size = None
if tones_dict is not None:
with open(tones_dict, "r") as f:
with open(tones_dict, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
tone_size = len(tone_id)
spk_num = None
if speaker_dict is not None:
-with open(speaker_dict, 'rt') as f:
+with open(speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
odim = am_config.n_mels

@@ -119,7 +119,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -114,7 +114,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -78,7 +78,7 @@ def train_sp(args, config):
if args.speaker_dict is not None:
print("multiple speaker vits!")
collate_fn = vits_multi_spk_batch_fn
-with open(args.speaker_dict, 'rt') as f:
+with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
spk_id = [line.strip().split() for line in f.readlines()]
spk_num = len(spk_id)
fields += ["spk_id"]
@@ -132,7 +132,7 @@ def train_sp(args, config):
num_workers=config.num_workers)
print("dataloaders done!")
with open(args.phones_dict, "r") as f:
with open(args.phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

@@ -58,7 +58,7 @@ class English(Phonetics):
self.punc = ":,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
-with open(phone_vocab_path, 'rt') as f:
+with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)

@@ -144,12 +144,12 @@ class Frontend():
self.vocab_phones = {}
self.vocab_tones = {}
if phone_vocab_path:
-with open(phone_vocab_path, 'rt') as f:
+with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
for phn, id in phn_id:
self.vocab_phones[phn] = int(id)
if tone_vocab_path:
-with open(tone_vocab_path, 'rt') as f:
+with open(tone_vocab_path, 'rt', encoding='utf-8') as f:
tone_id = [line.strip().split() for line in f.readlines()]
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)

@@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater):
loss.backward()
optimizer.step()
+if self.use_guided_attn_loss:
+    report("train/attn_loss", float(attn_loss))
+    losses_dict["attn_loss"] = float(attn_loss)
report("train/l1_loss", float(l1_loss))
report("train/mse_loss", float(mse_loss))
report("train/bce_loss", float(bce_loss))
report("train/attn_loss", float(attn_loss))
report("train/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
@@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator):
attn_loss = self.attn_loss(
att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
loss = loss + attn_loss
+if self.use_guided_attn_loss:
+    report("eval/attn_loss", float(attn_loss))
+    losses_dict["attn_loss"] = float(attn_loss)
report("eval/l1_loss", float(l1_loss))
report("eval/mse_loss", float(mse_loss))
report("eval/bce_loss", float(bce_loss))
report("eval/attn_loss", float(attn_loss))
report("eval/loss", float(loss))
losses_dict["l1_loss"] = float(l1_loss)
losses_dict["mse_loss"] = float(mse_loss)
losses_dict["bce_loss"] = float(bce_loss)
losses_dict["attn_loss"] = float(attn_loss)
losses_dict["loss"] = float(loss)
self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_dict.items())
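Both hunks move the `attn_loss` reporting under the `use_guided_attn_loss` flag. That matters because `attn_loss` is only computed when guided attention is enabled, so reporting it unconditionally references a name that may never have been bound. A stripped-down sketch of the bug class (assumed shape, not repo code):

```python
use_guided_attn_loss = False
losses_dict = {}

if use_guided_attn_loss:
    attn_loss = 0.1  # only bound when the feature is on

# Old shape of the code -- raises NameError when the flag is off:
# losses_dict["attn_loss"] = float(attn_loss)

# New shape -- guarded, as in the hunks above:
if use_guided_attn_loss:
    losses_dict["attn_loss"] = float(attn_loss)
```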

@@ -37,9 +37,7 @@ base = [
"g2pM",
"h5py",
"inflect",
"jieba",
"jsonlines",
"kaldiio",
"librosa==0.8.1",
"loguru",
"matplotlib",
@@ -51,22 +49,16 @@ base = [
"paddlenlp>=2.4.8",
"ppdiffusers>=0.9.0",
"paddlespeech_feat",
"Pillow>=9.0.0",
"praatio==5.0.0",
"praatio>=5.0.0",
"pypinyin<=0.44.0",
"pypinyin-dict",
"python-dateutil",
"pyworld==0.2.12",
"resampy==0.2.2",
"pyworld>=0.2.12",
"resampy",
"sacrebleu",
"scipy",
"sentencepiece~=0.1.96",
"soundfile~=0.10",
"textgrid",
"timer",
"tqdm",
"typeguard",
"visualdl",
"webrtcvad",
"yacs~=0.1.8",
"prettytable",
@@ -74,10 +66,10 @@ base = [
"braceexpand",
"pyyaml",
"paddleslim>=2.3.4",
"paddleaudio>=1.0.2",
"paddleaudio>=1.1.0",
]
server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"]
server = ["pattern_singleton", "websockets"]
requirements = {
"install":
@@ -300,7 +292,7 @@ setup_info = dict(
},
# Package info
-packages=find_packages(include=('paddlespeech*')),
+packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']),
zip_safe=True,
classifiers=[
'Development Status :: 5 - Production/Stable',

@@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then
mkdir -p BZNSYP
unrar x BZNSYP.rar BZNSYP
wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
+# Pre-fetch nltk_data so a network failure cannot leave the program hanging
+wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz
+tar -xzf nltk_data.tar.gz -C ${HOME}
# data preprocessing
python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml
python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
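g2p_en pulls NLTK corpora on first import, so pre-seeding `~/nltk_data` keeps benchmark runs off the network. A quick way to confirm the unpacked data is visible (assuming the tagger and cmudict resources g2p_en normally fetches):

```python
import nltk

print(nltk.data.path)  # ~/nltk_data is on the default search path
nltk.data.find('taggers/averaged_perceptron_tagger')  # raises LookupError if missing
nltk.data.find('corpora/cmudict')
```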

@@ -129,7 +129,7 @@ decoders_module = [
setup(
name='paddlespeech_ctcdecoders',
-version='0.2.0',
+version='0.2.2',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
