fix ; and format

pull/2015/head
Hui Zhang 3 years ago
parent 69a6da4c16
commit dfdf450b22

@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import json
 import os

@@ -14,7 +14,3 @@
 import _locale
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@@ -145,4 +145,3 @@ for com, info in _commands.items():
         name='paddlespeech.{}'.format(com),
         description=info[0],
         cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
-

@@ -21,12 +21,12 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram

 __all__ = ['CLSExecutor']

@@ -22,13 +22,13 @@ from typing import Union
 import paddle
 import soundfile
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

@@ -22,8 +22,7 @@ model_alias = {
     # -------------- ASR --------------
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
-    "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
+    "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],

@@ -76,7 +76,8 @@ class CTCPrefixScorePD():
         last_ids = [yi[-1] for yi in y]  # last output label ids
         n_bh = len(last_ids)  # batch * hyps
         n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
-        self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
+        self.scoring_num = paddle.shape(scoring_ids)[
+            -1] if scoring_ids is not None else 0
         # prepare state info
         if state is None:
             r_prev = paddle.full(

@@ -22,11 +22,9 @@ import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle import inference
-from paddle.io import DataLoader
-from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.dataset import ManifestDataset
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
@@ -238,8 +236,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type,
-            vocab=config.vocab_filepath)
+            unit_type=config.unit_type, vocab=config.vocab_filepath)
         self.vocab_list = self._text_featurizer.vocab_list

     def ordid2token(self, texts, texts_len):
@@ -248,7 +245,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         for text, n in zip(texts, texts_len):
             n = n.numpy().item()
             ids = text[:n]
-            trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
+            trans.append(
+                self._text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans

     def compute_metrics(self,

@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
+
 from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model
 from paddlespeech.s2t.utils import dynamic_pip_install
-import sys

 try:
     import paddlespeech_ctcdecoders

@@ -372,11 +372,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
+    def forward(self,
+                audio_chunk,
+                audio_chunk_lens,
+                chunk_state_h_box=None,
                 chunk_state_c_box=None):
         if self.encoder.rnn_direction == "forward":
             eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
-                audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
+                audio_chunk, audio_chunk_lens, chunk_state_h_box,
+                chunk_state_c_box)
             probs_chunk = self.decoder.softmax(eouts_chunk)
             return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
         elif self.encoder.rnn_direction == "bidirect":
@@ -392,8 +396,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
             self,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, None,
-                           self.encoder.feat_size],  #[B, chunk_size, feat_dim]
+                    shape=[None, None, self.encoder.feat_size
+                           ],  #[B, chunk_size, feat_dim]
                     dtype='float32'),
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]

@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0)
+        m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m

     def forward(self, x: paddle.Tensor, t: paddle.Tensor

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import nullcontext
-

 import paddle

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 from typing import Union

 import paddle
@@ -22,7 +23,6 @@ from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.modules.loss import CTCLoss
 from paddlespeech.s2t.utils import ctc_utils
 from paddlespeech.s2t.utils.log import Log
-import sys

 logger = Log(__name__).getlog()

@@ -82,7 +82,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
     max_size = paddle.shape(sequences[0])
     # (TODO Hui Zhang): slice not supprot `end==start`
     # trailing_dims = max_size[1:]
-    trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
+    trailing_dims = tuple(
+        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
     max_len = max([s.shape[0] for s in sequences])
     if batch_first:
         out_dims = (len(sequences), max_len) + trailing_dims
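
For reference, the wrapped `trailing_dims` expression above only changes line breaks: the logic keeps every dimension after axis 0, so padding preserves the per-frame feature shape. A minimal NumPy sketch of that behaviour (the `pad_list` helper is hypothetical, not part of this commit):

```python
import numpy as np

def pad_list(seqs, pad_value=0.0):
    # Keep all dims after axis 0; pad each sequence to the longest length.
    trailing_dims = tuple(seqs[0].shape[1:]) if seqs[0].ndim >= 2 else ()
    max_len = max(s.shape[0] for s in seqs)
    out = np.full((len(seqs), max_len) + trailing_dims,
                  pad_value,
                  dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :s.shape[0]] = s
    return out

a = np.ones((5, 80), dtype=np.float32)  # 5 frames, 80-dim features
b = np.ones((3, 80), dtype=np.float32)  # 3 frames
print(pad_list([a, b]).shape)  # (2, 5, 80), batch_first layout
```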

@@ -55,7 +55,7 @@ class PaddleASRConnectionHanddler:
         self.config = asr_engine.config  # server config
         self.model_config = asr_engine.executor.config
         self.asr_engine = asr_engine
-
+        # model_type, sample_rate and text_feature is shared for deepspeech2 and conformer
         self.model_type = self.asr_engine.executor.model_type
         self.sample_rate = self.asr_engine.executor.sample_rate
@@ -191,7 +191,7 @@ class PaddleASRConnectionHanddler:
         self.num_frames = 0

         ## endpoint
         self.endpoint_state = False  # True for detect endpoint

         ## conformer
         self.model_reset()
@@ -503,11 +503,13 @@ class PaddleASRConnectionHanddler:
         # endpoint
         if not is_finished:

             def contain_nonsilence():
                 return len(self.hyps) > 0 and len(self.hyps[0]) > 0

             decoding_something = contain_nonsilence()
-            if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something):
+            if self.endpointer.endpoint_detected(ctc_probs.numpy(),
+                                                 decoding_something):
                 self.endpoint_state = True
                 logger.info(f"Endpoint is detected at {self.num_frames} frame.")
@@ -869,7 +871,6 @@ class ASREngine(BaseEngine):
         logger.info("Initialize ASR server engine successfully.")
         return True

-
     def new_handler(self):
         """New handler from model.

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
+from typing import List

 import numpy as np
-
 from paddlespeech.cli.log import logger
@@ -76,12 +76,11 @@ class OnlineCTCEndpoint:
             decoding_something or (not rule.must_contain_nonsilence)
         ) and trailine_silence >= rule.min_trailing_silence and utterance_length >= rule.min_utterance_length
         if (ans):
-            logger.info(
-                f"Endpoint Rule: {rule_name} activated: {rule}"
-            )
+            logger.info(f"Endpoint Rule: {rule_name} activated: {rule}")
         return ans

-    def endpoint_detected(self, ctc_log_probs: np.ndarray,
+    def endpoint_detected(self,
+                          ctc_log_probs: np.ndarray,
                           decoding_something: bool) -> bool:
         """detect endpoint.

@@ -42,7 +42,6 @@ class TTSServerExecutor(TTSExecutor):
         self.task_resource = CommonTaskResource(
             task='tts', model_format='dynamic', inference_mode='online')

-
     def get_model_info(self,
                        field: str,
                        model_name: str,

@@ -19,7 +19,6 @@ from fastapi import WebSocketDisconnect
 from starlette.websockets import WebSocketState as WebSocketState

 from paddlespeech.cli.log import logger
-from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.engine.engine_pool import get_engine_pool

 router = APIRouter()
@@ -106,7 +105,7 @@ async def websocket_endpoint(websocket: WebSocket):
                     logger.info("endpoint: detected and rescoring.")
                     connection_handler.rescoring()
-
+                    word_time_stamp = connection_handler.get_word_time_stamp()
                     asr_results = connection_handler.get_result()
                     if connection_handler.endpoint_state:
@@ -124,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket):
                     }
                     await websocket.send_json(resp)
                     break
-
+                # return the current partial result
+                # if the engine create the vad instance, this connection will have many partial results
                 resp = {'result': asr_results}

@@ -140,10 +140,7 @@ def parse_args():
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -179,10 +176,7 @@ def parse_args():
         ],
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -174,10 +174,7 @@ def parse_args():
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -220,10 +217,7 @@ def parse_args():
         ],
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -131,10 +131,7 @@ def parse_args():
         choices=['fastspeech2_aishell3', 'tacotron2_aishell3'],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -160,10 +157,7 @@ def parse_args():
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .vits import *
 from .vits_updater import *

@@ -56,7 +56,8 @@ class VITSUpdater(StandardUpdater):
         self.models: Dict[str, Layer] = models
         # self.model = model
-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

         self.optimizers = optimizers
         self.optimizer_g: Optimizer = optimizers['generator']
@@ -225,7 +226,8 @@ class VITSEvaluator(StandardEvaluator):
         models = {"main": model}
         self.models: Dict[str, Layer] = models
         # self.model = model
-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

         self.criterions = criterions
         self.criterion_mel = criterions['mel']

@@ -971,18 +971,18 @@ class FeatureMatchLoss(nn.Layer):
         return feat_match_loss


 # loss for VITS
 class KLDivergenceLoss(nn.Layer):
     """KL divergence loss."""

     def forward(
             self,
             z_p: paddle.Tensor,
             logs_q: paddle.Tensor,
             m_p: paddle.Tensor,
             logs_p: paddle.Tensor,
-            z_mask: paddle.Tensor,
-    ) -> paddle.Tensor:
+            z_mask: paddle.Tensor, ) -> paddle.Tensor:
         """Calculate KL divergence loss.

         Args:
@@ -1002,8 +1002,8 @@ class KLDivergenceLoss(nn.Layer):
         logs_p = paddle.cast(logs_p, 'float32')
         z_mask = paddle.cast(z_mask, 'float32')
         kl = logs_p - logs_q - 0.5
-        kl += 0.5 * ((z_p - m_p) ** 2) * paddle.exp(-2.0 * logs_p)
+        kl += 0.5 * ((z_p - m_p)**2) * paddle.exp(-2.0 * logs_p)
         kl = paddle.sum(kl * z_mask)
         loss = kl / paddle.sum(z_mask)

         return loss
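
For reference, the expression being reformatted here is the sampled KL between the approximate posterior q = N(m_q, e^logs_q) and the prior p = N(m_p, e^logs_p): per element it is logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)^2 * exp(-2 * logs_p), where the posterior quadratic term has been replaced by its expectation 0.5. A small NumPy sanity check of that arithmetic (shapes are illustrative):

```python
import numpy as np

def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    # Same arithmetic as the hunk above, with a masked mean at the end.
    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p)**2) * np.exp(-2.0 * logs_p)
    return (kl * z_mask).sum() / z_mask.sum()

rng = np.random.default_rng(0)
m = rng.normal(size=(2, 4, 8)).astype(np.float32)
logs = np.zeros((2, 4, 8), dtype=np.float32)   # unit variance
mask = np.ones((2, 4, 8), dtype=np.float32)
z = m + rng.normal(size=(2, 4, 8)).astype(np.float32)  # z ~ N(m, 1)
print(kl_loss(z, logs, m, logs, mask))  # ~0 on average when q == p
```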

@@ -25,4 +25,3 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host
 > Reminder: Only for developer, make sure you know what's it.

-
 * codelab - for speechx developer, using for test.

@@ -3,4 +3,4 @@
 ## Examples

 * `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
 * `aishell` - Streaming Decoding under aishell dataset, for local WER test.

@@ -4,4 +4,3 @@
 > Reminder: Only for developer.

-
 * codelab - for speechx developer, using for test.

@@ -91,8 +91,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
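
The reformatted arithmetic above determines how many feature frames feed one decoder chunk: a full receptive field for the first subsampled output, plus one stride per additional output. A small Python mirror of the C++ logic, with illustrative flag values (7/4/8 are examples, not values taken from this diff):

```python
def chunk_geometry(receptive_field_length, downsampling_rate, nnet_decoder_chunk):
    # Frames required to produce nnet_decoder_chunk subsampled outputs.
    chunk_size = (receptive_field_length +
                  (nnet_decoder_chunk - 1) * downsampling_rate)
    # Frames the window advances between successive chunks.
    chunk_stride = downsampling_rate * nnet_decoder_chunk
    return chunk_size, chunk_stride

print(chunk_geometry(7, 4, 8))  # (35, 32): 35-frame window, advancing 32 frames
```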

@@ -64,7 +64,7 @@ std::string TLGDecoder::GetPartialResult() {
         std::string word = word_symbol_table_->Find(words_id[idx]);
         words += word;
     }
     return words;
 }

 std::string TLGDecoder::GetFinalBestPath() {

@@ -82,7 +82,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
     opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate;
     opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length;
     opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;

     return opts;
 }

@@ -93,8 +93,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;

@@ -24,7 +24,8 @@ using std::unique_ptr;
 Assembler::Assembler(AssemblerOptions opts,
                      unique_ptr<FrontendInterface> base_extractor) {
     frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk;
-    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length;
+    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate +
+                        opts.receptive_filed_length;
     receptive_filed_length_ = opts.receptive_filed_length;
     base_extractor_ = std::move(base_extractor);
     dim_ = base_extractor_->Dim();
@@ -50,8 +51,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
         Vector<BaseFloat> feature;
         result = base_extractor_->Read(&feature);
         if (result == false || feature.Dim() == 0) {
             if (IsFinished() == false) return false;
             break;
         }
         feature_cache_.push(feature);
     }
@@ -61,22 +62,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
     }

     while (feature_cache_.size() < frame_chunk_size_) {
         Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
         feature_cache_.push(feature);
     }

     int32 counter = 0;
     int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;
     int32 elem_dim = base_extractor_->Dim();

     while (counter < frame_chunk_size_) {
         Vector<BaseFloat>& val = feature_cache_.front();
         int32 start = counter * elem_dim;
         feats->Range(start, elem_dim).CopyFromVec(val);
-        if (frame_chunk_size_ - counter <= cache_size ) {
+        if (frame_chunk_size_ - counter <= cache_size) {
             feature_cache_.push(val);
         }
         feature_cache_.pop();
         counter++;
     }

     return result;

@@ -25,7 +25,7 @@ struct AssemblerOptions {
     int32 receptive_filed_length;
     int32 subsampling_rate;
     int32 nnet_decoder_chunk;

     AssemblerOptions()
         : receptive_filed_length(1),
           subsampling_rate(1),
@@ -47,15 +47,11 @@ class Assembler : public FrontendInterface {
     // feat dim
     virtual size_t Dim() const { return dim_; }

-    virtual void SetFinished() {
-        base_extractor_->SetFinished();
-    }
+    virtual void SetFinished() { base_extractor_->SetFinished(); }

     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

-    virtual void Reset() {
-        base_extractor_->Reset();
-    }
+    virtual void Reset() { base_extractor_->Reset(); }

   private:
     bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);

@@ -30,7 +30,7 @@ class AudioCache : public FrontendInterface {
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

     // the audio dim is 1, one sample, which is useless,
     // so we return size_(cache samples) instead.
     virtual size_t Dim() const { return size_; }

@@ -29,19 +29,19 @@ using kaldi::Matrix;
 using std::vector;

 FbankComputer::FbankComputer(const Options& opts)
-    : opts_(opts),
-      computer_(opts) {}
+    : opts_(opts), computer_(opts) {}

 int32 FbankComputer::Dim() const {
     return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }

 bool FbankComputer::NeedRawLogEnergy() {
     return opts_.use_energy && opts_.raw_energy;
 }

 // Compute feat
-bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+bool FbankComputer::Compute(Vector<BaseFloat>* window,
+                            Vector<BaseFloat>* feat) {
     RealFft(window, true);
     kaldi::ComputePowerSpectrum(window);
     const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));

@@ -72,9 +72,9 @@ bool FeatureCache::Compute() {
     bool result = base_extractor_->Read(&feature);
     if (result == false || feature.Dim() == 0) return false;

-    int32 num_chunk = feature.Dim() / dim_ ;
+    int32 num_chunk = feature.Dim() / dim_;
     for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
         int32 start = chunk_idx * dim_;
         Vector<BaseFloat> feature_chunk(dim_);
         SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
         feature_chunk.CopyFromVec(tmp);

@@ -22,9 +22,7 @@ namespace ppspeech {

 struct FeatureCacheOptions {
     int32 max_size;
     int32 timeout;  // ms
-    FeatureCacheOptions()
-        : max_size(kint16max),
-          timeout(1) {}
+    FeatureCacheOptions() : max_size(kint16max), timeout(1) {}
 };

 class FeatureCache : public FrontendInterface {

@@ -23,11 +23,11 @@ template <class F>
 class StreamingFeatureTpl : public FrontendInterface {
   public:
     typedef typename F::Options Options;

     StreamingFeatureTpl(const Options& opts,
                         std::unique_ptr<FrontendInterface> base_extractor);
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

     // the dim_ is the dim of single frame feature
     virtual size_t Dim() const { return computer_.Dim(); }
@@ -39,8 +39,9 @@ class StreamingFeatureTpl : public FrontendInterface {
         base_extractor_->Reset();
         remained_wav_.Resize(0);
     }
+
   private:
     bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                  kaldi::Vector<kaldi::BaseFloat>* feats);
     Options opts_;
     std::unique_ptr<FrontendInterface> base_extractor_;

@@ -16,16 +16,15 @@

 namespace ppspeech {

 template <class F>
-StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts,
-    std::unique_ptr<FrontendInterface> base_extractor):
-    opts_(opts),
-    computer_(opts),
-    window_function_(opts.frame_opts) {
+StreamingFeatureTpl<F>::StreamingFeatureTpl(
+    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
     base_extractor_ = std::move(base_extractor);
 }

 template <class F>
-void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
+void StreamingFeatureTpl<F>::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
     base_extractor_->Accept(waves);
 }
@@ -58,8 +57,9 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {

 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                                     kaldi::Vector<kaldi::BaseFloat>* feats) {
+bool StreamingFeatureTpl<F>::Compute(
+    const kaldi::Vector<kaldi::BaseFloat>& waves,
+    kaldi::Vector<kaldi::BaseFloat>* feats) {
     const kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     int32 num_samples = waves.Dim();
@@ -84,9 +84,11 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
             &window,
             need_raw_log_energy ? &raw_log_energy : NULL);

-        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
+                                                     kaldi::kUndefined);
         computer_.Compute(&window, &this_feature);
-        kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        kaldi::SubVector<kaldi::BaseFloat> output_row(
+            feats->Data() + frame * Dim(), Dim());
         output_row.CopyFromVec(this_feature);
     }
     return true;

@@ -16,6 +16,7 @@

 #pragma once

+#include "frontend/audio/assembler.h"
 #include "frontend/audio/audio_cache.h"
 #include "frontend/audio/data_cache.h"
 #include "frontend/audio/fbank.h"
@@ -23,7 +24,6 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
-#include "frontend/audio/assembler.h"

 namespace ppspeech {

@@ -28,22 +28,21 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;

-LinearSpectrogramComputer::LinearSpectrogramComputer(
-    const Options& opts)
+LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts)
     : opts_(opts) {
     kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
     int32 window_size = opts.frame_opts.WindowSize();
     frame_length_ = window_size;
     dim_ = window_size / 2 + 1;
-    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
-        feature_window_function.window);
+    BaseFloat hanning_window_energy = kaldi::VecVec(
+        feature_window_function.window, feature_window_function.window);
     int32 sample_rate = opts.frame_opts.samp_freq;
     scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }

 // Compute spectrogram feat
 bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
                                         Vector<BaseFloat>* feat) {
     window->Resize(frame_length_, kaldi::kCopyData);
     RealFft(window, true);
     kaldi::ComputePowerSpectrum(window);

@@ -14,8 +14,8 @@

 #include "base/flags.h"
 #include "base/log.h"
-#include "frontend/audio/data_cache.h"
 #include "frontend/audio/assembler.h"
+#include "frontend/audio/data_cache.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
@@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
@@ -130,7 +130,9 @@ int main(int argc, char* argv[]) {
         vector<kaldi::BaseFloat> prob;
         while (decodable->FrameLikelihood(frame_idx, &prob)) {
             kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
-            std::memcpy(vec_tmp.Data(), prob.data(), sizeof(kaldi::BaseFloat)*prob.size());
+            std::memcpy(vec_tmp.Data(),
+                        prob.data(),
+                        sizeof(kaldi::BaseFloat) * prob.size());
             prob_vec.push_back(vec_tmp);
             frame_idx++;
         }
@@ -142,7 +144,8 @@ int main(int argc, char* argv[]) {
             KALDI_LOG << " the nnet prob of " << utt << " is empty";
             continue;
         }
-        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),prob_vec[0].Dim());
+        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
+                                               prob_vec[0].Dim());
         for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
             for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
                 result(row_idx, col_idx) = prob_vec[row_idx](col_idx);

@@ -40,8 +40,8 @@ class WebSocketClient {
     void SendEndSignal();
     void SendDataEnd();
     bool Done() const { return done_; }
     std::string GetResult() const { return result_; }
-    std::string GetPartialResult() const { return partial_result_;}
+    std::string GetPartialResult() const { return partial_result_; }

   private:
     void Connect();

@@ -76,9 +76,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
     recognizer_->Accept(pcm_data);
     std::string partial_result = recognizer_->GetPartialResult();

-    json::value rv = {
-        {"status", "ok"}, {"type", "partial_result"}, {"result", partial_result}};
+    json::value rv = {{"status", "ok"},
+                      {"type", "partial_result"},
+                      {"result", partial_result}};
     ws_.text(true);
     ws_.write(asio::buffer(json::serialize(rv)));
 }
