diff --git a/examples/voxceleb/sv0/local/extract_speaker_embedding.py b/examples/voxceleb/sv0/local/extract_speaker_embedding.py
index 8eb24e1d..e7dad140 100644
--- a/examples/voxceleb/sv0/local/extract_speaker_embedding.py
+++ b/examples/voxceleb/sv0/local/extract_speaker_embedding.py
@@ -22,11 +22,11 @@ from paddle.io import BatchSampler
 from paddle.io import DataLoader
 from tqdm import tqdm
 
+from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets.voxceleb import VoxCeleb1
 from paddleaudio.features.core import melspectrogram
-from paddleaudio.backends import load as load_audio
-from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 from paddlespeech.vector.training.metrics import compute_eer
@@ -41,6 +41,7 @@ cpu_feat_conf = {
     'hop_length': 160,  #ms
 }
 
+
 def extract_audio_embedding(args):
     # stage 0: set the training device, cpu or gpu
     paddle.set_device(args.device)
@@ -59,6 +60,8 @@ def extract_audio_embedding(args):
     }
     ecapa_tdnn = EcapaTdnn(**model_conf)
 
+    # stage 4: build the speaker verification model with the backbone network
+    model = SpeakerIdetification(backbone=ecapa_tdnn, num_class=1211)
     # stage 2: load the pre-trained model
     args.load_checkpoint = os.path.abspath(
         os.path.expanduser(args.load_checkpoint))
@@ -71,18 +74,29 @@ def extract_audio_embedding(args):
     # stage 3: we must set the model to eval mode
     model.eval()
-
+
     # stage 4: read the audio data and extract the embedding
+    # the waveform is a one-dimensional numpy array
     waveform, sr = load_audio(args.audio_path)
+
+    # the feat is a numpy array whose shape is [dim, time]
+    # we need to convert it to the one-batch shape [batch, dim, time], where batch is one
+    # so the final shape is [1, dim, time]
     feat = melspectrogram(x=waveform, **cpu_feat_conf)
     feat = paddle.to_tensor(feat).unsqueeze(0)
-    lengths = paddle.ones([1])  # in paddle inference model, the lengths is all one without padding
-    feat = feature_normalize(feat, mean_norm=True, std_norm=False)
-    embedding = ecapa_tdnn(feat, lengths
-                           ).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
+
+    # at inference time, the lengths are all ones, since there is no padding
+    lengths = paddle.ones([1])
+    feat = feature_normalize(
+        feat, mean_norm=True, std_norm=False, convert_to_numpy=True)
+
+    # the model backbone network forwards the feats to get the embedding
+    embedding = model.backbone(
+        feat, lengths).squeeze().numpy()  # (1, emb_size, 1) -> (emb_size)
 
     # stage 5: do global norm with external mean and std
     # todo
+    # np.save("audio-embedding", embedding)
     return embedding
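A minimal usage sketch (not part of the patch): assuming extract_audio_embedding() returns a 1-D numpy embedding as above, two utterances can be compared with plain cosine similarity, which is the same scoring idea the cosine back-end in speaker_verification_cosine.py applies per trial pair.

import numpy as np

def cosine_score(emb1: np.ndarray, emb2: np.ndarray) -> float:
    # cosine similarity between two 1-D (emb_size,) embeddings;
    # the epsilon guards against zero-norm vectors
    return float(
        np.dot(emb1, emb2) /
        (np.linalg.norm(emb1) * np.linalg.norm(emb2) + 1e-12))

# hypothetical calls mirroring the script above:
# score = cosine_score(extract_audio_embedding(enroll_args),
#                      extract_audio_embedding(test_args))
# accept = score > threshold  # threshold tuned on a dev trial list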
diff --git a/examples/voxceleb/sv0/local/speaker_verification_cosine.py b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
index b0adcf66..417e8aa3 100644
--- a/examples/voxceleb/sv0/local/speaker_verification_cosine.py
+++ b/examples/voxceleb/sv0/local/speaker_verification_cosine.py
@@ -120,7 +120,7 @@ def main(args):
         **cpu_feat_conf)
     enrol_sampler = BatchSampler(
         enrol_ds, batch_size=args.batch_size,
-        shuffle=False)  # Shuffle to make embedding normalization more robust.
+        shuffle=True)  # Shuffle to make embedding normalization more robust.
     enrol_loader = DataLoader(enrol_ds,
                               batch_sampler=enrol_sampler,
                               collate_fn=lambda x: feature_normalize(
@@ -136,7 +136,7 @@ def main(args):
         **cpu_feat_conf)
     test_sampler = BatchSampler(
-        test_ds, batch_size=args.batch_size, shuffle=False)
+        test_ds, batch_size=args.batch_size, shuffle=True)
     test_loader = DataLoader(test_ds,
                              batch_sampler=test_sampler,
                              collate_fn=lambda x: feature_normalize(
diff --git a/examples/voxceleb/sv0/local/train.py b/examples/voxceleb/sv0/local/train.py
index 745d5eab..3fe67c8e 100644
--- a/examples/voxceleb/sv0/local/train.py
+++ b/examples/voxceleb/sv0/local/train.py
@@ -56,10 +56,10 @@ def main(args):
     # set the random seed, it is a must for multiprocess training
     seed_everything(args.seed)
 
-    # stage2: data prepare, such vox1 and vox2 data, and augment data and pipline
+    # stage 2: data preparation, such as the vox1 and vox2 data, plus the augmentation noise data and pipeline
     # note: some cmd must do in rank==0, so wo will refactor the data prepare code
-    train_ds = VoxCeleb1('train', target_dir=args.data_dir)
-    dev_ds = VoxCeleb1('dev', target_dir=args.data_dir)
+    train_dataset = VoxCeleb1('train', target_dir=args.data_dir)
+    dev_dataset = VoxCeleb1('dev', target_dir=args.data_dir)
 
     if args.augment:
         augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
@@ -123,9 +123,9 @@ def main(args):
     # stage8: we build the batch sampler for paddle.DataLoader
     train_sampler = DistributedBatchSampler(
-        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
+        train_dataset, batch_size=args.batch_size, shuffle=True, drop_last=False)
     train_loader = DataLoader(
-        train_ds,
+        train_dataset,
         batch_sampler=train_sampler,
         num_workers=args.num_workers,
         collate_fn=waveform_collate_fn,
@@ -216,12 +216,12 @@ def main(args):
     # stage 9-12: construct the valid dataset dataloader
     dev_sampler = BatchSampler(
-        dev_ds,
+        dev_dataset,
         batch_size=args.batch_size // 4,
         shuffle=False,
         drop_last=False)
     dev_loader = DataLoader(
-        dev_ds,
+        dev_dataset,
         batch_sampler=dev_sampler,
         collate_fn=waveform_collate_fn,
         num_workers=args.num_workers,
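The cosine scoring script above reports performance with compute_eer from paddlespeech.vector.training.metrics, whose exact signature is not shown in this diff. As an illustrative stand-in only, the equal error rate can be computed from trial labels and cosine scores with plain numpy:

import numpy as np

def equal_error_rate(labels: np.ndarray, scores: np.ndarray) -> float:
    # labels: 1 for same-speaker trials, 0 for different-speaker trials
    # scores: cosine similarities; sweep each score as a threshold
    order = np.argsort(scores)
    sorted_labels = labels[order]
    n_pos = sorted_labels.sum()
    n_neg = len(sorted_labels) - n_pos
    # false rejection rate: positives scored at or below the threshold
    frr = np.cumsum(sorted_labels) / n_pos
    # false acceptance rate: negatives scored above the threshold
    far = (n_neg - np.cumsum(1 - sorted_labels)) / n_neg
    idx = np.argmin(np.abs(far - frr))
    return float((far[idx] + frr[idx]) / 2)

labels = np.array([1, 1, 0, 0, 1, 0])
scores = np.array([0.82, 0.77, 0.45, 0.60, 0.70, 0.30])
print(equal_error_rate(labels, scores))  # 0.0 for this separable toy data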
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
index c3b31ce5..34a1cbd4 100755
--- a/examples/voxceleb/sv0/run.sh
+++ b/examples/voxceleb/sv0/run.sh
@@ -3,6 +3,8 @@
 set -e
 
 #######################################################################
+# stage 0: data preparation, including downloading voxceleb1 and generating the {train,dev,enroll,test}.csv files
+# the voxceleb2 data is in m4a format, so users must convert m4a to wav themselves, as described in Readme.md
 # stage 1: train the speaker identification model
 # stage 2: test speaker identification
 # stage 3: extract the training embeding to train the LDA and PLDA
@@ -12,23 +14,42 @@ set -e
 # default the dataset is the ~/.paddleaudio/
 # export PPAUDIO_HOME=
 
-stage=2
-dir=data/ # data directory
-exp_dir=exp/ecapa-tdnn/ # experiment directory
+stage=0
+dir=data.bak/ # data directory
+exp_dir=exp/ecapa-tdnn/ # experiment directory
 mkdir -p ${dir}
+mkdir -p ${exp_dir}
+
+# if [ $stage -le 0 ]; then
+#     # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+#     # todo
+# fi
 
 if [ $stage -le 1 ]; then
     # stage 1: train the speaker identification model
     python3 \
     -m paddle.distributed.launch --gpus=0,1,2,3 \
-    local/train.py --device "gpu" --checkpoint-dir ${exp_dir} \
-    --save-freq 10 --data-dir ${dir} --batch-size 256 --epochs 60
+    local/train.py --device "gpu" --checkpoint-dir ${exp_dir} --augment \
+    --save-freq 10 --data-dir ${dir} --batch-size 64 --epochs 100
 fi
 
 if [ $stage -le 2 ]; then
     # stage 1: train the speaker identification model
+    # you can set the variable PPAUDIO_HOME to specify where the downloaded vox1 and vox2 datasets are stored
+    python3 \
+    local/speaker_verification_cosine.py \
+    --batch-size 4 --data-dir ${dir} --load-checkpoint ${exp_dir}/epoch_10/
+fi
+
+if [ $stage -le 3 ]; then
+    # stage 3: extract the speaker embedding of a single utterance
+    # you can set the variable PPAUDIO_HOME to specify where the downloaded vox1 and vox2 datasets are stored
     python3 \
-    local/speaker_verification_cosine.py \
-    --load-checkpoint ${exp_dir}/epoch_40/
+    local/extract_speaker_embedding.py \
+    --audio-path "demo/csv/00001.wav" --load-checkpoint ${exp_dir}/epoch_60/
 fi
+
+# if [ $stage -le 4 ]; then
+#     # stage 4: extract the training embeddings to train the LDA and PLDA
+#     # todo: extract the training embeddings
+# fi
diff --git a/paddleaudio/datasets/voxceleb.py b/paddleaudio/datasets/voxceleb.py
index 28f6dfc6..c97e825e 100644
--- a/paddleaudio/datasets/voxceleb.py
+++ b/paddleaudio/datasets/voxceleb.py
@@ -28,7 +28,7 @@ from paddleaudio.backends import load as load_audio
 from paddleaudio.datasets.dataset import feat_funcs
 from paddleaudio.utils import DATA_HOME
 from paddleaudio.utils import decompress
-from paddleaudio.utils import download_and_decompress
+from paddlespeech.vector.utils.download import download_and_decompress
 from paddlespeech.s2t.utils.log import Log
 from utils.utility import download
 from utils.utility import unpack
@@ -106,13 +106,14 @@ class VoxCeleb1(Dataset):
         self.chunk_duration = chunk_duration
         self.split_ratio = split_ratio
         self.target_dir = target_dir if target_dir else self.base_path
-        self.csv_path = os.path.join(
+        VoxCeleb1.csv_path = os.path.join(
             target_dir, 'csv') if target_dir else os.path.join(self.base_path, 'csv')
-        self.meta_path = os.path.join(
+        VoxCeleb1.meta_path = os.path.join(
             target_dir, 'meta') if target_dir else os.path.join(self.base_path, 'meta')
-        self.veri_test_file = os.path.join(self.meta_path, 'veri_test2.txt')
+        VoxCeleb1.veri_test_file = os.path.join(self.meta_path,
+                                                'veri_test2.txt')
         # self._data = self._get_data()[:1000]  # KP: Small dataset test.
         self._data = self._get_data()
         super(VoxCeleb1, self).__init__()
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
index 9db615f6..879cde3a 100644
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -24,10 +24,19 @@ def waveform_collate_fn(batch):
 
 def feature_normalize(feats: paddle.Tensor,
                       mean_norm: bool=True,
-                      std_norm: bool=True):
+                      std_norm: bool=True,
+                      convert_to_numpy: bool=False):
     # Features normalization if needed
-    mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0
-    std = feats.std(axis=-1, keepdim=True) if std_norm else 1
-    feats = (feats - mean) / std
+    # numpy.mean differs slightly from paddle.mean, by about 1e-6
+    if convert_to_numpy:
+        feats_np = feats.numpy()
+        mean = feats_np.mean(axis=-1, keepdims=True) if mean_norm else 0
+        std = feats_np.std(axis=-1, keepdims=True) if std_norm else 1
+        feats_np = (feats_np - mean) / std
+        feats = paddle.to_tensor(feats_np, dtype=feats.dtype)
+    else:
+        mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0
+        std = feats.std(axis=-1, keepdim=True) if std_norm else 1
+        feats = (feats - mean) / std
 
     return feats
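A quick sanity check of the new convert_to_numpy branch in feature_normalize above; per the comment in the diff, the numpy and the paddle statistics should only disagree at around the 1e-6 level (the feature shape below is an arbitrary example):

import numpy as np
import paddle
from paddlespeech.vector.io.batch import feature_normalize

feats = paddle.randn([4, 80, 200])  # [batch, dim, time], arbitrary sizes
out_np = feature_normalize(
    feats, mean_norm=True, std_norm=False, convert_to_numpy=True)
out_pd = feature_normalize(feats, mean_norm=True, std_norm=False)
# both branches return paddle tensors; the max gap should be tiny (~1e-6)
print(float(np.abs(out_np.numpy() - out_pd.numpy()).max()))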