From 4c3e57a23ccbe085f014cf31163b18dd70cac2a3 Mon Sep 17 00:00:00 2001
From: TianYuan <white-sky@qq.com>
Date: Tue, 25 Jan 2022 06:33:24 +0000
Subject: [PATCH] align preprocess of wavernn, test=tts

---
 examples/csmsc/voc6/local/preprocess.sh       |  48 +++-
 examples/csmsc/voc6/local/synthesize.sh       |   3 +-
 examples/csmsc/voc6/local/train.sh            |   6 +-
 examples/csmsc/voc6/run.sh                    |   7 +-
 paddlespeech/t2s/datasets/vocoder_batch_fn.py | 216 +++++++++---------
 paddlespeech/t2s/exps/wavernn/preprocess.py   | 157 -------------
 paddlespeech/t2s/exps/wavernn/synthesize.py   |  61 +++--
 paddlespeech/t2s/exps/wavernn/train.py        |  36 ++-
 .../t2s/models/wavernn/wavernn_updater.py     |  36 ++-
 9 files changed, 250 insertions(+), 320 deletions(-)
 delete mode 100644 paddlespeech/t2s/exps/wavernn/preprocess.py

diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh
index 064aea557..2dcc39ac7 100755
--- a/examples/csmsc/voc6/local/preprocess.sh
+++ b/examples/csmsc/voc6/local/preprocess.sh
@@ -6,10 +6,50 @@ stop_stage=100
 config_path=$1
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    python3 ${BIN_DIR}/preprocess.py \
-        --input=~/datasets/BZNSYP/ \
-        --output=dump \
-        --dataset=csmsc \
+    # get durations from MFA's result
+    echo "Generate durations.txt from MFA results ..."
+    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+        --inputdir=./baker_alignment_tone \
+        --output=durations.txt \
+        --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # extract features
+    echo "Extract features ..."
+    python3 ${BIN_DIR}/../gan_vocoder/preprocess.py \
+        --rootdir=~/datasets/BZNSYP/ \
+        --dataset=baker \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
         --config=${config_path} \
+        --cut-sil=True \
         --num-cpu=20
 fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats (mean and std)
+    echo "Get features' stats ..."
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="feats"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # normalize, dev and test should use train's stats
+    echo "Normalize ..."
+
+    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --stats=dump/train/feats_stats.npy
+    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --stats=dump/train/feats_stats.npy
+
+    python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --stats=dump/train/feats_stats.npy
+fi
diff --git a/examples/csmsc/voc6/local/synthesize.sh b/examples/csmsc/voc6/local/synthesize.sh
index 876c8444e..7f0cbe48c 100755
--- a/examples/csmsc/voc6/local/synthesize.sh
+++ b/examples/csmsc/voc6/local/synthesize.sh
@@ -3,12 +3,11 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
-test_input=$4
 
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize.py \
     --config=${config_path} \
     --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --input=${test_input} \
+    --test-metadata=dump/test/norm/metadata.jsonl \
     --output-dir=${train_output_path}/test
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
index 900450cdd..9695631ef 100755
--- a/examples/csmsc/voc6/local/train.sh
+++ b/examples/csmsc/voc6/local/train.sh
@@ -2,8 +2,12 @@
 
 config_path=$1
 train_output_path=$2
+
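+# cuDNN flags: exhaustive conv algorithm search and a larger conv workspace for faster training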
+FLAGS_cudnn_exhaustive_search=true \
+FLAGS_conv_workspace_size_limit=4000 \
 python ${BIN_DIR}/train.py \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
     --config=${config_path} \
-    --data=dump/ \
     --output-dir=${train_output_path} \
     --ngpu=1
diff --git a/examples/csmsc/voc6/run.sh b/examples/csmsc/voc6/run.sh
index bd32e3d2e..5f754fff3 100755
--- a/examples/csmsc/voc6/run.sh
+++ b/examples/csmsc/voc6/run.sh
@@ -9,7 +9,7 @@ stop_stage=100
 
 conf_path=conf/default.yaml
 train_output_path=exp/default
-test_input=dump/mel_test
+test_input=dump/dump_gta_test
 ckpt_name=snapshot_iter_100000.pdz
 
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
@@ -25,9 +25,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # copy some test mels from dump
-    mkdir -p ${test_input}
-    cp -r dump/mel/00995*.npy ${test_input}
     # synthesize
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} ${test_input}|| exit -1
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
diff --git a/paddlespeech/t2s/datasets/vocoder_batch_fn.py b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
index 496bf902a..b1d22db97 100644
--- a/paddlespeech/t2s/datasets/vocoder_batch_fn.py
+++ b/paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -12,11 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from pathlib import Path
 
 import numpy as np
 import paddle
-from paddle.io import Dataset
 
 
 def label_2_float(x, bits):
@@ -44,102 +42,6 @@ def decode_mu_law(y, mu, from_labels=True):
     return x
 
 
-class WaveRNNDataset(Dataset):
-    """A simple dataset adaptor for the processed ljspeech dataset."""
-
-    def __init__(self, root):
-        self.root = Path(root).expanduser()
-
-        records = []
-
-        with open(self.root / "metadata.csv", 'r') as rf:
-
-            for line in rf:
-                name = line.split("\t")[0]
-                mel_path = str(self.root / "mel" / (str(name) + ".npy"))
-                wav_path = str(self.root / "wav" / (str(name) + ".npy"))
-                records.append((mel_path, wav_path))
-
-        self.records = records
-
-    def __getitem__(self, i):
-        mel_name, wav_name = self.records[i]
-        mel = np.load(mel_name)
-        wav = np.load(wav_name)
-        return mel, wav
-
-    def __len__(self):
-        return len(self.records)
-
-
-class WaveRNNClip(object):
-    def __init__(self,
-                 mode: str='RAW',
-                 batch_max_steps: int=4500,
-                 hop_size: int=300,
-                 aux_context_window: int=2,
-                 bits: int=9):
-        self.mode = mode
-        self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
-        self.batch_max_steps = batch_max_steps
-        self.hop_size = hop_size
-        self.aux_context_window = aux_context_window
-        if self.mode == 'MOL':
-            self.bits = 16
-        else:
-            self.bits = bits
-
-    def __call__(self, batch):
-        # batch: [mel, quant]
-        # voc_pad = 2  this will pad the input so that the resnet can 'see' wider than input length
-        # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
-        max_offsets = [
-            x[0].shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
-            for x in batch
-        ]
-        # the slice point of mel selecting randomly 
-        mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
-        # the slice point of wav selecting randomly, which is behind 2(=pad) frames 
-        sig_offsets = [(offset + self.aux_context_window) * self.hop_size
-                       for offset in mel_offsets]
-        # mels.sape[1] = voc_seq_len // hop_length + 2 * voc_pad
-        mels = [
-            x[0][:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
-            for i, x in enumerate(batch)
-        ]
-        # label.shape[1] = voc_seq_len + 1
-        labels = [
-            x[1][sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
-            for i, x in enumerate(batch)
-        ]
-
-        mels = np.stack(mels).astype(np.float32)
-        labels = np.stack(labels).astype(np.int64)
-
-        mels = paddle.to_tensor(mels)
-        labels = paddle.to_tensor(labels, dtype='int64')
-
-        # x is input, y is label
-        x = labels[:, :self.batch_max_steps]
-        y = labels[:, 1:]
-        '''
-        mode = RAW:
-            mu_law = True:
-                quant: bits = 9   0, 1, 2, ..., 509, 510, 511  int
-            mu_law = False
-                quant bits = 9    [0, 511]  float
-        mode = MOL:
-            quant: bits = 16  [0. 65536]  float
-        '''
-        # x should be normalizes in.[0, 1] in RAW mode
-        x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
-        # y should be normalizes in.[0, 1] in MOL mode
-        if self.mode == 'MOL':
-            y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
-
-        return x, y, mels
-
-
 class Clip(object):
     """Collate functor for training vocoders.
     """
@@ -174,7 +76,7 @@ class Clip(object):
         self.end_offset = -(self.batch_max_frames + aux_context_window)
         self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
 
-    def __call__(self, examples):
+    def __call__(self, batch):
         """Convert into batch tensors.
 
         Parameters
@@ -192,11 +94,11 @@ class Clip(object):
 
         """
         # check length
-        examples = [
-            self._adjust_length(b['wave'], b['feats']) for b in examples
+        batch = [
+            self._adjust_length(b['wave'], b['feats']) for b in batch
             if b['feats'].shape[0] > self.mel_threshold
         ]
-        xs, cs = [b[0] for b in examples], [b[1] for b in examples]
+        xs, cs = [b[0] for b in batch], [b[1] for b in batch]
 
         # make batch with random cut
         c_lengths = [c.shape[0] for c in cs]
@@ -214,7 +116,7 @@ class Clip(object):
         c_batch = np.stack(
             [c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
 
-        # convert each batch to tensor, asuume that each item in batch has the same length
+        # convert each batch to tensor, assume that each item in batch has the same length
         y_batch = paddle.to_tensor(
             y_batch, dtype=paddle.float32).unsqueeze(1)  # (B, 1, T)
         c_batch = paddle.to_tensor(
@@ -245,3 +147,111 @@ class Clip(object):
             0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
 
         return x, c
+
+
+class WaveRNNClip(Clip):
+    def __init__(self,
+                 mode: str='RAW',
+                 batch_max_steps: int=4500,
+                 hop_size: int=300,
+                 aux_context_window: int=2,
+                 bits: int=9,
+                 mu_law: bool=True):
+        self.mode = mode
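+        # mel frames needed to cover batch_max_steps samples, plus
+        # aux_context_window frames of context on each side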
+        self.mel_win = batch_max_steps // hop_size + 2 * aux_context_window
+        self.batch_max_steps = batch_max_steps
+        self.hop_size = hop_size
+        self.aux_context_window = aux_context_window
+        self.mu_law = mu_law
+        self.batch_max_frames = batch_max_steps // hop_size
+        self.mel_threshold = self.batch_max_frames + 2 * aux_context_window
+        if self.mode == 'MOL':
+            self.bits = 16
+        else:
+            self.bits = bits
+
+    def to_quant(self, wav):
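+        # quantize a float waveform to integer labels: mu-law or linear
+        # (2**bits classes) in RAW mode, 16-bit labels in MOL mode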
+        if self.mode == 'RAW':
+            if self.mu_law:
+                quant = encode_mu_law(wav, mu=2**self.bits)
+            else:
+                quant = float_2_label(wav, bits=self.bits)
+        elif self.mode == 'MOL':
+            quant = float_2_label(wav, bits=16)
+        quant = quant.astype(np.int64)
+        return quant
+
+    def __call__(self, batch):
+        """Convert into batch tensors.
+
+        Parameters
+        ----------
+        batch : list
+            list of examples, each with an audio 'wave' of shape (T, )
+            and features 'feats' of shape (T', C).
+
+        Returns
+        ----------
+        Tensor
+            Input signal batch x (B, T), converted to float.
+        Tensor
+            Target label batch y (B, T); integer labels in RAW mode,
+            converted to float in MOL mode.
+        Tensor
+            Auxiliary feature batch mels (B, C, T'), where
+            T = (T' - 2 * aux_context_window) * hop_size.
+
+        """
+        # check length
+        batch = [
+            self._adjust_length(b['wave'], b['feats']) for b in batch
+            if b['feats'].shape[0] > self.mel_threshold
+        ]
+        wav, mel = [b[0] for b in batch], [b[1] for b in batch]
+        # transpose mel from (T', C) to (C, T') for slicing along frames
+        mel = [x.T for x in mel]
+        # voc_pad = 2  this will pad the input so that the resnet can 'see' wider than input length
+        # max_offsets = n_frames - 2 - (mel_win + 2 * hp.voc_pad) = n_frames - 15
+        max_offsets = [
+            x.shape[-1] - 2 - (self.mel_win + 2 * self.aux_context_window)
+            for x in mel
+        ]
+        # randomly select the slice point of mel
+        mel_offsets = [np.random.randint(0, offset) for offset in max_offsets]
+        # the slice point of wav, which is aux_context_window (=2 pad) frames after the mel slice point
+        sig_offsets = [(offset + self.aux_context_window) * self.hop_size
+                       for offset in mel_offsets]
+        # mels.shape[1] = voc_seq_len // hop_length + 2 * voc_pad
+        mels = [
+            x[:, mel_offsets[i]:mel_offsets[i] + self.mel_win]
+            for i, x in enumerate(mel)
+        ]
+        wav = [self.to_quant(x) for x in wav]
+
+        # label.shape[1] = voc_seq_len + 1
+        labels = [
+            x[sig_offsets[i]:sig_offsets[i] + self.batch_max_steps + 1]
+            for i, x in enumerate(wav)
+        ]
+
+        mels = np.stack(mels).astype(np.float32)
+        labels = np.stack(labels).astype(np.int64)
+
+        mels = paddle.to_tensor(mels)
+        labels = paddle.to_tensor(labels, dtype='int64')
+        # x is input, y is label
+        x = labels[:, :self.batch_max_steps]
+        y = labels[:, 1:]
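+        # y is x shifted one sample ahead: the model predicts the next sample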
+        '''
+        mode = RAW:
+            mu_law = True:
+                quant: bits = 9   0, 1, 2, ..., 509, 510, 511  int
+            mu_law = False:
+                quant: bits = 9    [0, 511]  float
+        mode = MOL:
+            quant: bits = 16  [0, 65536]  float
+        '''
+        # x should be normalized to [0, 1] in RAW mode
+        x = label_2_float(paddle.cast(x, dtype='float32'), self.bits)
+        # y should be normalized to [0, 1] in MOL mode
+        if self.mode == 'MOL':
+            y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)
+
+        return x, y, mels
diff --git a/paddlespeech/t2s/exps/wavernn/preprocess.py b/paddlespeech/t2s/exps/wavernn/preprocess.py
deleted file mode 100644
index a26c6702a..000000000
--- a/paddlespeech/t2s/exps/wavernn/preprocess.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-from multiprocessing import cpu_count
-from multiprocessing import Pool
-from pathlib import Path
-
-import librosa
-import numpy as np
-import pandas as pd
-import tqdm
-import yaml
-from yacs.config import CfgNode
-
-from paddlespeech.t2s.data.get_feats import LogMelFBank
-from paddlespeech.t2s.datasets import CSMSCMetaData
-from paddlespeech.t2s.datasets import LJSpeechMetaData
-from paddlespeech.t2s.datasets.vocoder_batch_fn import encode_mu_law
-from paddlespeech.t2s.datasets.vocoder_batch_fn import float_2_label
-
-
-class Transform(object):
-    def __init__(self, output_dir: Path, config):
-        self.fs = config.fs
-        self.peak_norm = config.peak_norm
-        self.bits = config.model.bits
-        self.mode = config.model.mode
-        self.mu_law = config.mu_law
-
-        self.wav_dir = output_dir / "wav"
-        self.mel_dir = output_dir / "mel"
-        self.wav_dir.mkdir(exist_ok=True)
-        self.mel_dir.mkdir(exist_ok=True)
-
-        self.mel_extractor = LogMelFBank(
-            sr=config.fs,
-            n_fft=config.n_fft,
-            hop_length=config.n_shift,
-            win_length=config.win_length,
-            window=config.window,
-            n_mels=config.n_mels,
-            fmin=config.fmin,
-            fmax=config.fmax)
-
-        if self.mode != 'RAW' and self.mode != 'MOL':
-            raise RuntimeError('Unknown mode value - ', self.mode)
-
-    def __call__(self, example):
-        wav_path, _, _ = example
-
-        base_name = os.path.splitext(os.path.basename(wav_path))[0]
-        # print("self.sample_rate:",self.sample_rate)
-        wav, _ = librosa.load(wav_path, sr=self.fs)
-        peak = np.abs(wav).max()
-        if self.peak_norm or peak > 1.0:
-            wav /= peak
-
-        mel = self.mel_extractor.get_log_mel_fbank(wav).T
-        if self.mode == 'RAW':
-            if self.mu_law:
-                quant = encode_mu_law(wav, mu=2**self.bits)
-            else:
-                quant = float_2_label(wav, bits=self.bits)
-        elif self.mode == 'MOL':
-            quant = float_2_label(wav, bits=16)
-
-        mel = mel.astype(np.float32)
-        audio = quant.astype(np.int64)
-
-        np.save(str(self.wav_dir / base_name), audio)
-        np.save(str(self.mel_dir / base_name), mel)
-
-        return base_name, mel.shape[-1], audio.shape[-1]
-
-
-def create_dataset(config,
-                   input_dir,
-                   output_dir,
-                   nprocs: int=1,
-                   dataset_type: str="ljspeech"):
-    input_dir = Path(input_dir).expanduser()
-    '''
-    LJSpeechMetaData.records: [filename, normalized text, speaker name(ljspeech)]
-    CSMSCMetaData.records: [filename, normalized text, pinyin]
-    '''
-    if dataset_type == 'ljspeech':
-        dataset = LJSpeechMetaData(input_dir)
-    else:
-        dataset = CSMSCMetaData(input_dir)
-    output_dir = Path(output_dir).expanduser()
-    output_dir.mkdir(exist_ok=True)
-
-    transform = Transform(output_dir, config)
-
-    file_names = []
-
-    pool = Pool(processes=nprocs)
-
-    for info in tqdm.tqdm(pool.imap(transform, dataset), total=len(dataset)):
-        base_name, mel_len, audio_len = info
-        file_names.append((base_name, mel_len, audio_len))
-
-    meta_data = pd.DataFrame.from_records(file_names)
-    meta_data.to_csv(
-        str(output_dir / "metadata.csv"), sep="\t", index=None, header=None)
-    print("saved meta data in to {}".format(
-        os.path.join(output_dir, "metadata.csv")))
-
-    print("Done!")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="create dataset")
-    parser.add_argument(
-        "--config", type=str, help="config file to overwrite default config.")
-
-    parser.add_argument(
-        "--input", type=str, help="path of the ljspeech dataset")
-    parser.add_argument(
-        "--output", type=str, help="path to save output dataset")
-    parser.add_argument(
-        "--num-cpu",
-        type=int,
-        default=cpu_count() // 2,
-        help="number of process.")
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default="ljspeech",
-        help="The dataset to preprocess, ljspeech or csmsc")
-
-    args = parser.parse_args()
-
-    with open(args.config, 'rt') as f:
-        config = CfgNode(yaml.safe_load(f))
-
-    if args.dataset != "ljspeech" and args.dataset != "csmsc":
-        raise RuntimeError('Unknown dataset - ', args.dataset)
-
-    create_dataset(
-        config,
-        input_dir=args.input,
-        output_dir=args.output,
-        nprocs=args.num_cpu,
-        dataset_type=args.dataset)
diff --git a/paddlespeech/t2s/exps/wavernn/synthesize.py b/paddlespeech/t2s/exps/wavernn/synthesize.py
index e08c52b60..61723e039 100644
--- a/paddlespeech/t2s/exps/wavernn/synthesize.py
+++ b/paddlespeech/t2s/exps/wavernn/synthesize.py
@@ -15,13 +15,16 @@ import argparse
 import os
 from pathlib import Path
 
+import jsonlines
 import numpy as np
 import paddle
 import soundfile as sf
 import yaml
 from paddle import distributed as dist
+from timer import timer
 from yacs.config import CfgNode
 
+from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.models.wavernn import WaveRNN
 
 
@@ -30,10 +33,7 @@ def main():
 
     parser.add_argument("--config", type=str, help="GANVocoder config file.")
     parser.add_argument("--checkpoint", type=str, help="snapshot to load.")
-    parser.add_argument(
-        "--input",
-        type=str,
-        help="path of directory containing mel spectrogram (in .npy format)")
+    parser.add_argument("--test-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
     parser.add_argument(
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
@@ -65,24 +65,43 @@ def main():
 
     model.eval()
 
-    mel_dir = Path(args.input).expanduser()
-    output_dir = Path(args.output_dir).expanduser()
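+    # each metadata entry provides an utt_id and the path to its normalized
+    # mel features (.npy), loaded with np.load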
+    with jsonlines.open(args.test_metadata, 'r') as reader:
+        metadata = list(reader)
+    test_dataset = DataTable(
+        metadata,
+        fields=['utt_id', 'feats'],
+        converters={
+            'utt_id': None,
+            'feats': np.load,
+        })
+    output_dir = Path(args.output_dir)
     output_dir.mkdir(parents=True, exist_ok=True)
-    for file_path in sorted(mel_dir.iterdir()):
-        mel = np.load(str(file_path))
-        mel = paddle.to_tensor(mel)
-        mel = mel.transpose([1, 0])
-        # input shape is (T', C_aux)
-        audio = model.generate(
-            c=mel,
-            batched=config.inference.gen_batched,
-            target=config.inference.target,
-            overlap=config.inference.overlap,
-            mu_law=config.mu_law,
-            gen_display=True)
-        audio_path = output_dir / (os.path.splitext(file_path.name)[0] + ".wav")
-        sf.write(audio_path, audio.numpy(), samplerate=config.fs)
-        print("[synthesize] {} -> {}".format(file_path, audio_path))
+
+    N = 0
+    T = 0
+    for example in test_dataset:
+        utt_id = example['utt_id']
+        mel = example['feats']
+        mel = paddle.to_tensor(mel)  # (T, C)
+        with timer() as t:
+            with paddle.no_grad():
+                wav = model.generate(
+                    c=mel,
+                    batched=config.inference.gen_batched,
+                    target=config.inference.target,
+                    overlap=config.inference.overlap,
+                    mu_law=config.mu_law,
+                    gen_display=True)
+            wav = wav.numpy()
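+            # accumulate generated samples (N) and elapsed time (T);
+            # speed is samples/s and RTF = fs / speed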
+            N += wav.size
+            T += t.elapse
+            speed = wav.size / t.elapse
+            rtf = config.fs / speed
+        print(
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+        )
+        sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
+    print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
 
 
 if __name__ == "__main__":
diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py
index d7bfc49bf..aec745f76 100644
--- a/paddlespeech/t2s/exps/wavernn/train.py
+++ b/paddlespeech/t2s/exps/wavernn/train.py
@@ -16,6 +16,8 @@ import os
 import shutil
 from pathlib import Path
 
+import jsonlines
+import numpy as np
 import paddle
 import yaml
 from paddle import DataParallel
@@ -25,9 +27,8 @@ from paddle.io import DistributedBatchSampler
 from paddle.optimizer import Adam
 from yacs.config import CfgNode
 
-from paddlespeech.t2s.data import dataset
+from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNClip
-from paddlespeech.t2s.datasets.vocoder_batch_fn import WaveRNNDataset
 from paddlespeech.t2s.models.wavernn import WaveRNN
 from paddlespeech.t2s.models.wavernn import WaveRNNEvaluator
 from paddlespeech.t2s.models.wavernn import WaveRNNUpdater
@@ -56,10 +57,26 @@ def train_sp(args, config):
         f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
     )
 
-    wavernn_dataset = WaveRNNDataset(args.data)
-
-    train_dataset, dev_dataset = dataset.split(
-        wavernn_dataset, len(wavernn_dataset) - config.valid_size)
+    # construct dataset for training and validation
+    with jsonlines.open(args.train_metadata, 'r') as reader:
+        train_metadata = list(reader)
+    train_dataset = DataTable(
+        data=train_metadata,
+        fields=["wave", "feats"],
+        converters={
+            "wave": np.load,
+            "feats": np.load,
+        }, )
+
+    with jsonlines.open(args.dev_metadata, 'r') as reader:
+        dev_metadata = list(reader)
+    dev_dataset = DataTable(
+        data=dev_metadata,
+        fields=["wave", "feats"],
+        converters={
+            "wave": np.load,
+            "feats": np.load,
+        }, )
 
     batch_fn = WaveRNNClip(
         mode=config.model.mode,
@@ -92,7 +109,9 @@ def train_sp(args, config):
         collate_fn=batch_fn,
         batch_sampler=dev_sampler,
         num_workers=config.num_workers)
+
     valid_generate_loader = DataLoader(dev_dataset, batch_size=1)
+
     print("dataloaders done!")
 
     model = WaveRNN(
@@ -160,10 +179,11 @@ def train_sp(args, config):
 def main():
     # parse args and config and redirect to train_sp
 
-    parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
+    parser = argparse.ArgumentParser(description="Train a WaveRNN model.")
     parser.add_argument(
         "--config", type=str, help="config file to overwrite default config.")
-    parser.add_argument("--data", type=str, help="input")
+    parser.add_argument("--train-metadata", type=str, help="training data.")
+    parser.add_argument("--dev-metadata", type=str, help="dev data.")
     parser.add_argument("--output-dir", type=str, help="output dir.")
     parser.add_argument(
         "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
diff --git a/paddlespeech/t2s/models/wavernn/wavernn_updater.py b/paddlespeech/t2s/models/wavernn/wavernn_updater.py
index e6064e4cb..b2756d00c 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn_updater.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn_updater.py
@@ -21,8 +21,6 @@ from paddle.io import DataLoader
 from paddle.nn import Layer
 from paddle.optimizer import Optimizer
 
-from paddlespeech.t2s.datasets.vocoder_batch_fn import decode_mu_law
-from paddlespeech.t2s.datasets.vocoder_batch_fn import label_2_float
 from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
 from paddlespeech.t2s.training.reporter import report
 from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
@@ -156,31 +154,22 @@ class WaveRNNEvaluator(StandardEvaluator):
 
         losses_dict["loss"] = float(loss)
 
-        self.iteration = ITERATION
-        if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
-            self.gen_valid_samples()
-
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
         self.logger.info(self.msg)
 
     def gen_valid_samples(self):
 
-        for i, (mel, wav) in enumerate(self.valid_generate_loader):
+        for i, item in enumerate(self.valid_generate_loader):
             if i >= self.config.generate_num:
-                print("before break")
                 break
             print(
                 '\n| Generating: {}/{}'.format(i + 1, self.config.generate_num))
-            wav = wav[0]
-            if self.mode == 'MOL':
-                bits = 16
-            else:
-                bits = self.config.model.bits
-            if self.config.mu_law and self.mode != 'MOL':
-                wav = decode_mu_law(wav, 2**bits, from_labels=True)
-            else:
-                wav = label_2_float(wav, bits)
+
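+            # 'wave' here is the float waveform saved by preprocessing, so it
+            # can be written out directly without mu-law decoding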
+            mel = item['feats']
+            wav = item['wave']
+            wav = wav.squeeze(0)
+
             origin_save_path = self.valid_samples_dir / '{}_steps_{}_target.wav'.format(
                 self.iteration, i)
             sf.write(origin_save_path, wav.numpy(), samplerate=self.config.fs)
@@ -193,11 +182,20 @@ class WaveRNNEvaluator(StandardEvaluator):
             gen_save_path = str(self.valid_samples_dir /
                                 '{}_steps_{}_{}.wav'.format(self.iteration, i,
                                                             batch_str))
-            # (1, C_aux, T) -> (T, C_aux)
-            mel = mel.squeeze(0).transpose([1, 0])
+            # (1, T, C_aux) -> (T, C_aux)
+            mel = mel.squeeze(0)
             gen_sample = self.model.generate(
                 mel, self.config.inference.gen_batched,
                 self.config.inference.target, self.config.inference.overlap,
                 self.config.mu_law)
             sf.write(
                 gen_save_path, gen_sample.numpy(), samplerate=self.config.fs)
+
+    def __call__(self, trainer=None):
+        summary = self.evaluate()
+        for k, v in summary.items():
+            report(k, v)
+        # gen samples at the end of evaluate
+        self.iteration = ITERATION
+        if self.iteration % self.config.gen_eval_samples_interval_steps == 0:
+            self.gen_valid_samples()