fix #2013; and format

4 years ago · dfdf450b22
parent 69a6da4c16
commit dfdf450b22
44 changed files with 124 additions and 146 deletions
--- a/examples/wenetspeech/asr1/local/extract_meta.py
+++ b/examples/wenetspeech/asr1/local/extract_meta.py
@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import json
 import os
--- a/paddlespeech/init.py
+++ b/paddlespeech/init.py
@ -14,7 +14,3 @@
 import _locale

 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
-
-
-
-
--- a/paddlespeech/cli/base_commands.py
+++ b/paddlespeech/cli/base_commands.py
@ -145,4 +145,3 @@ for com, info in _commands.items():
        name='paddlespeech.{}'.format(com),
        description=info[0],
        cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
-        
--- a/paddlespeech/cli/cls/infer.py
+++ b/paddlespeech/cli/cls/infer.py
@ -21,12 +21,12 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram

 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram

 __all__ = ['CLSExecutor']

--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@ -22,13 +22,13 @@ from typing import Union

 import paddle
 import soundfile
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode

 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

--- a/paddlespeech/resource/model_alias.py
+++ b/paddlespeech/resource/model_alias.py
@ -22,8 +22,7 @@ model_alias = {
    # -------------- ASR --------------
    # ---------------------------------
    "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
-    "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
+    "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
    "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
    "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
    "transformer": ["paddlespeech.s2t.models.u2:U2Model"],
--- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
@ -76,7 +76,8 @@ class CTCPrefixScorePD():
        last_ids = [yi[-1] for yi in y]  # last output label ids
        n_bh = len(last_ids)  # batch * hyps
        n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
-        self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
+        self.scoring_num = paddle.shape(scoring_ids)[
+            -1] if scoring_ids is not None else 0
        # prepare state info
        if state is None:
            r_prev = paddle.full(
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@ -22,11 +22,9 @@ import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle import inference
-from paddle.io import DataLoader
-from paddlespeech.s2t.io.dataloader import BatchDataLoader

 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.dataset import ManifestDataset
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
@ -238,8 +236,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
    def __init__(self, config, args):
        super().__init__(config, args)
        self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type,
-            vocab=config.vocab_filepath)
+            unit_type=config.unit_type, vocab=config.vocab_filepath)
        self.vocab_list = self._text_featurizer.vocab_list

    def ordid2token(self, texts, texts_len):
@ -248,7 +245,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        for text, n in zip(texts, texts_len):
            n = n.numpy().item()
            ids = text[:n]
-            trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
+            trans.append(
+                self._text_featurizer.defeaturize(ids.numpy().tolist()))
        return trans

    def compute_metrics(self,
--- a/paddlespeech/s2t/models/ds2/init.py
+++ b/paddlespeech/s2t/models/ds2/init.py
@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
+
 from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model
 from paddlespeech.s2t.utils import dynamic_pip_install
-import sys

 try:
    import paddlespeech_ctcdecoders
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
@ -372,11 +372,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

-    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
+    def forward(self,
+                audio_chunk,
+                audio_chunk_lens,
+                chunk_state_h_box=None,
                chunk_state_c_box=None):
        if self.encoder.rnn_direction == "forward":
            eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
-                audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
+                audio_chunk, audio_chunk_lens, chunk_state_h_box,
+                chunk_state_c_box)
            probs_chunk = self.decoder.softmax(eouts_chunk)
            return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
        elif self.encoder.rnn_direction == "bidirect":
@ -392,8 +396,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
                self,
                input_spec=[
                    paddle.static.InputSpec(
-                        shape=[None, None,
-                               self.encoder.feat_size],  #[B, chunk_size, feat_dim]
+                        shape=[None, None, self.encoder.feat_size
+                               ],  #[B, chunk_size, feat_dim]
                        dtype='float32'),
                    paddle.static.InputSpec(shape=[None],
                                            dtype='int64'),  # audio_length, [B]
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):

    def _target_mask(self, ys_in_pad):
        ys_mask = ys_in_pad != 0
-        m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0)
+        m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
        return ys_mask.unsqueeze(-2) & m

    def forward(self, x: paddle.Tensor, t: paddle.Tensor
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from contextlib import nullcontext

 import paddle
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 from typing import Union

 import paddle
@ -22,7 +23,6 @@ from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.modules.loss import CTCLoss
 from paddlespeech.s2t.utils import ctc_utils
 from paddlespeech.s2t.utils.log import Log
-import sys

 logger = Log(__name__).getlog()

--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
@ -82,7 +82,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
    max_size = paddle.shape(sequences[0])
    # (TODO Hui Zhang): slice does not support `end==start`
    # trailing_dims = max_size[1:]
-    trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
+    trailing_dims = tuple(
+        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
    max_len = max([s.shape[0] for s in sequences])
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
--- a/paddlespeech/server/engine/asr/online/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@ -55,7 +55,7 @@ class PaddleASRConnectionHanddler:
        self.config = asr_engine.config  # server config
        self.model_config = asr_engine.executor.config
        self.asr_engine = asr_engine
-    
+
        # model_type, sample_rate and text_feature are shared for deepspeech2 and conformer
        self.model_type = self.asr_engine.executor.model_type
        self.sample_rate = self.asr_engine.executor.sample_rate
@ -191,7 +191,7 @@ class PaddleASRConnectionHanddler:
        self.num_frames = 0

        ## endpoint
-        self.endpoint_state = False # True for detect endpoint
+        self.endpoint_state = False  # True for detect endpoint

        ## conformer
        self.model_reset()
@ -503,11 +503,13 @@ class PaddleASRConnectionHanddler:

        # endpoint
        if not is_finished:
+
            def contain_nonsilence():
                return len(self.hyps) > 0 and len(self.hyps[0]) > 0

            decoding_something = contain_nonsilence()
-            if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something):
+            if self.endpointer.endpoint_detected(ctc_probs.numpy(),
+                                                 decoding_something):
                self.endpoint_state = True
                logger.info(f"Endpoint is detected at {self.num_frames} frame.")

@ -869,7 +871,6 @@ class ASREngine(BaseEngine):
        logger.info("Initialize ASR server engine successfully.")
        return True

-
    def new_handler(self):
        """New handler from model.

--- a/paddlespeech/server/engine/asr/online/ctc_endpoint.py
+++ b/paddlespeech/server/engine/asr/online/ctc_endpoint.py
@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
-from typing import List
+
 import numpy as np

 from paddlespeech.cli.log import logger
@ -76,12 +76,11 @@ class OnlineCTCEndpoint:
            decoding_something or (not rule.must_contain_nonsilence)
        ) and trailine_silence >= rule.min_trailing_silence and utterance_length >= rule.min_utterance_length
        if (ans):
-            logger.info(
-                f"Endpoint Rule: {rule_name} activated: {rule}"
-            )
+            logger.info(f"Endpoint Rule: {rule_name} activated: {rule}")
        return ans

-    def endpoint_detected(self, ctc_log_probs: np.ndarray,
+    def endpoint_detected(self,
+                          ctc_log_probs: np.ndarray,
                          decoding_something: bool) -> bool:
        """detect endpoint.

--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@ -42,7 +42,6 @@ class TTSServerExecutor(TTSExecutor):
        self.task_resource = CommonTaskResource(
            task='tts', model_format='dynamic', inference_mode='online')

-
    def get_model_info(self,
                       field: str,
                       model_name: str,
--- a/paddlespeech/server/ws/asr_api.py
+++ b/paddlespeech/server/ws/asr_api.py
@ -19,7 +19,6 @@ from fastapi import WebSocketDisconnect
 from starlette.websockets import WebSocketState as WebSocketState

 from paddlespeech.cli.log import logger
-from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.engine.engine_pool import get_engine_pool
 router = APIRouter()

@ -106,7 +105,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    logger.info("endpoint: detected and rescoring.")
                    connection_handler.rescoring()
                    word_time_stamp = connection_handler.get_word_time_stamp()
-                
+
                asr_results = connection_handler.get_result()

                if connection_handler.endpoint_state:
@ -124,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket):
                        }
                        await websocket.send_json(resp)
                        break
- 
+
                # return the current partial result
                # if the engine creates the VAD instance, this connection will have many partial results
                resp = {'result': asr_results}
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@ -140,10 +140,7 @@ def parse_args():
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
    parser.add_argument(
        '--am_ckpt',
        type=str,
@ -179,10 +176,7 @@ def parse_args():
        ],
        help='Choose vocoder type of tts task.')
    parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
    parser.add_argument(
        '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
    parser.add_argument(
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@ -174,10 +174,7 @@ def parse_args():
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
    parser.add_argument(
        '--am_ckpt',
        type=str,
@ -220,10 +217,7 @@ def parse_args():
        ],
        help='Choose vocoder type of tts task.')
    parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
    parser.add_argument(
        '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
    parser.add_argument(
--- a/paddlespeech/t2s/exps/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning.py
@ -131,10 +131,7 @@ def parse_args():
        choices=['fastspeech2_aishell3', 'tacotron2_aishell3'],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
    parser.add_argument(
        '--am_ckpt',
        type=str,
@ -160,10 +157,7 @@ def parse_args():
        help='Choose vocoder type of tts task.')

    parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
    parser.add_argument(
        '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
    parser.add_argument(
--- a/paddlespeech/t2s/models/vits/init.py
+++ b/paddlespeech/t2s/models/vits/init.py
@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .vits import *
-from .vits_updater import *
+from .vits_updater import *
--- a/paddlespeech/t2s/models/vits/vits_updater.py
+++ b/paddlespeech/t2s/models/vits/vits_updater.py
@ -56,7 +56,8 @@ class VITSUpdater(StandardUpdater):
        self.models: Dict[str, Layer] = models
        # self.model = model

-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

        self.optimizers = optimizers
        self.optimizer_g: Optimizer = optimizers['generator']
@ -225,7 +226,8 @@ class VITSEvaluator(StandardEvaluator):
        models = {"main": model}
        self.models: Dict[str, Layer] = models
        # self.model = model
-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

        self.criterions = criterions
        self.criterion_mel = criterions['mel']
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@ -971,18 +971,18 @@ class FeatureMatchLoss(nn.Layer):

        return feat_match_loss

+
 # loss for VITS
 class KLDivergenceLoss(nn.Layer):
    """KL divergence loss."""

    def forward(
-        self,
-        z_p: paddle.Tensor,
-        logs_q: paddle.Tensor,
-        m_p: paddle.Tensor,
-        logs_p: paddle.Tensor,
-        z_mask: paddle.Tensor,
-    ) -> paddle.Tensor:
+            self,
+            z_p: paddle.Tensor,
+            logs_q: paddle.Tensor,
+            m_p: paddle.Tensor,
+            logs_p: paddle.Tensor,
+            z_mask: paddle.Tensor, ) -> paddle.Tensor:
        """Calculate KL divergence loss.

        Args:
@ -1002,8 +1002,8 @@ class KLDivergenceLoss(nn.Layer):
        logs_p = paddle.cast(logs_p, 'float32')
        z_mask = paddle.cast(z_mask, 'float32')
        kl = logs_p - logs_q - 0.5
-        kl += 0.5 * ((z_p - m_p) ** 2) * paddle.exp(-2.0 * logs_p)
+        kl += 0.5 * ((z_p - m_p)**2) * paddle.exp(-2.0 * logs_p)
        kl = paddle.sum(kl * z_mask)
        loss = kl / paddle.sum(z_mask)

-        return loss
+        return loss
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@ -25,4 +25,3 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host
 > Reminder: Only for developers; make sure you know what it is.

 * codelab - for speechx developers, used for testing.
-
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@ -3,4 +3,4 @@
 ## Examples

 * `websocket` - Streaming ASR with websocket for deepspeech2_aishell.    
-* `aishell` - Streaming Decoding under aishell dataset, for local WER test.    
+* `aishell` - Streaming Decoding under aishell dataset, for local WER test.    
--- a/speechx/speechx/codelab/README.md
+++ b/speechx/speechx/codelab/README.md
@ -4,4 +4,3 @@
 > Reminder: Only for developers.

 * codelab - for speechx developers, used for testing.
-
--- a/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
+++ b/speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
@ -91,8 +91,8 @@ int main(int argc, char* argv[]) {
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data));

-     int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
    int32 receptive_field_length = FLAGS_receptive_field_length;
    LOG(INFO) << "chunk size (frame): " << chunk_size;
--- a/speechx/speechx/decoder/ctc_tlg_decoder.cc
+++ b/speechx/speechx/decoder/ctc_tlg_decoder.cc
@ -64,7 +64,7 @@ std::string TLGDecoder::GetPartialResult() {
        std::string word = word_symbol_table_->Find(words_id[idx]);
        words += word;
    }
-    return words; 
+    return words;
 }

 std::string TLGDecoder::GetFinalBestPath() {
--- a/speechx/speechx/decoder/param.h
+++ b/speechx/speechx/decoder/param.h
@ -82,7 +82,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
    opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate;
    opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length;
    opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;
-    
+
    return opts;
 }

--- a/speechx/speechx/decoder/tlg_decoder_main.cc
+++ b/speechx/speechx/decoder/tlg_decoder_main.cc
@ -93,8 +93,8 @@ int main(int argc, char* argv[]) {
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
    int32 receptive_field_length = FLAGS_receptive_field_length;
    LOG(INFO) << "chunk size (frame): " << chunk_size;
--- a/speechx/speechx/frontend/audio/assembler.cc
+++ b/speechx/speechx/frontend/audio/assembler.cc
@ -24,7 +24,8 @@ using std::unique_ptr;
 Assembler::Assembler(AssemblerOptions opts,
                     unique_ptr<FrontendInterface> base_extractor) {
    frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk;
-    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length;
+    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate +
+                        opts.receptive_filed_length;
    receptive_filed_length_ = opts.receptive_filed_length;
    base_extractor_ = std::move(base_extractor);
    dim_ = base_extractor_->Dim();
@ -50,8 +51,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
        Vector<BaseFloat> feature;
        result = base_extractor_->Read(&feature);
        if (result == false || feature.Dim() == 0) {
-          if (IsFinished() == false) return false;
-          break;
+            if (IsFinished() == false) return false;
+            break;
        }
        feature_cache_.push(feature);
    }
@ -61,22 +62,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
    }

    while (feature_cache_.size() < frame_chunk_size_) {
-       Vector<BaseFloat> feature(dim_, kaldi::kSetZero); 
-       feature_cache_.push(feature);
+        Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
+        feature_cache_.push(feature);
    }

-    int32 counter = 0; 
+    int32 counter = 0;
    int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;
    int32 elem_dim = base_extractor_->Dim();
    while (counter < frame_chunk_size_) {
-      Vector<BaseFloat>& val = feature_cache_.front();
-      int32 start = counter * elem_dim;
-      feats->Range(start, elem_dim).CopyFromVec(val);
-      if (frame_chunk_size_ - counter <= cache_size ) {
-          feature_cache_.push(val);
-      }
-      feature_cache_.pop();
-      counter++;
+        Vector<BaseFloat>& val = feature_cache_.front();
+        int32 start = counter * elem_dim;
+        feats->Range(start, elem_dim).CopyFromVec(val);
+        if (frame_chunk_size_ - counter <= cache_size) {
+            feature_cache_.push(val);
+        }
+        feature_cache_.pop();
+        counter++;
    }

    return result;
--- a/speechx/speechx/frontend/audio/assembler.h
+++ b/speechx/speechx/frontend/audio/assembler.h
@ -25,7 +25,7 @@ struct AssemblerOptions {
    int32 receptive_filed_length;
    int32 subsampling_rate;
    int32 nnet_decoder_chunk;
-    
+
    AssemblerOptions()
        : receptive_filed_length(1),
          subsampling_rate(1),
@ -47,15 +47,11 @@ class Assembler : public FrontendInterface {
    // feat dim
    virtual size_t Dim() const { return dim_; }

-    virtual void SetFinished() {
-        base_extractor_->SetFinished();
-    }
+    virtual void SetFinished() { base_extractor_->SetFinished(); }

    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

-    virtual void Reset() {
-        base_extractor_->Reset();
-    }
+    virtual void Reset() { base_extractor_->Reset(); }

  private:
    bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);
--- a/speechx/speechx/frontend/audio/audio_cache.h
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@ -30,7 +30,7 @@ class AudioCache : public FrontendInterface {

    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

-    // the audio dim is 1, one sample, which is useless, 
+    // the audio dim is 1, one sample, which is useless,
    // so we return size_(cache samples) instead.
    virtual size_t Dim() const { return size_; }

--- a/speechx/speechx/frontend/audio/fbank.cc
+++ b/speechx/speechx/frontend/audio/fbank.cc
@ -29,19 +29,19 @@ using kaldi::Matrix;
 using std::vector;

 FbankComputer::FbankComputer(const Options& opts)
-    : opts_(opts),
-    computer_(opts) {}
+    : opts_(opts), computer_(opts) {}

 int32 FbankComputer::Dim() const {
    return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }

 bool FbankComputer::NeedRawLogEnergy() {
-    return opts_.use_energy && opts_.raw_energy; 
+    return opts_.use_energy && opts_.raw_energy;
 }

 // Compute feat
-bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+bool FbankComputer::Compute(Vector<BaseFloat>* window,
+                            Vector<BaseFloat>* feat) {
    RealFft(window, true);
    kaldi::ComputePowerSpectrum(window);
    const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));
--- a/speechx/speechx/frontend/audio/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@ -72,9 +72,9 @@ bool FeatureCache::Compute() {
    bool result = base_extractor_->Read(&feature);
    if (result == false || feature.Dim() == 0) return false;

-    int32 num_chunk = feature.Dim() / dim_ ;
+    int32 num_chunk = feature.Dim() / dim_;
    for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
-        int32 start = chunk_idx *  dim_;
+        int32 start = chunk_idx * dim_;
        Vector<BaseFloat> feature_chunk(dim_);
        SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
        feature_chunk.CopyFromVec(tmp);
--- a/speechx/speechx/frontend/audio/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@ -22,9 +22,7 @@ namespace ppspeech {
 struct FeatureCacheOptions {
    int32 max_size;
    int32 timeout;  // ms
-    FeatureCacheOptions()
-        : max_size(kint16max),
-          timeout(1) {}
+    FeatureCacheOptions() : max_size(kint16max), timeout(1) {}
 };

 class FeatureCache : public FrontendInterface {
--- a/speechx/speechx/frontend/audio/feature_common.h
+++ b/speechx/speechx/frontend/audio/feature_common.h
@ -23,11 +23,11 @@ template <class F>
 class StreamingFeatureTpl : public FrontendInterface {
  public:
    typedef typename F::Options Options;
-    StreamingFeatureTpl(const Options& opts, 
+    StreamingFeatureTpl(const Options& opts,
                        std::unique_ptr<FrontendInterface> base_extractor);
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
-    
+
    // the dim_ is the dim of single frame feature
    virtual size_t Dim() const { return computer_.Dim(); }

@ -39,8 +39,9 @@ class StreamingFeatureTpl : public FrontendInterface {
        base_extractor_->Reset();
        remained_wav_.Resize(0);
    }
+
  private:
-    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves, 
+    bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                 kaldi::Vector<kaldi::BaseFloat>* feats);
    Options opts_;
    std::unique_ptr<FrontendInterface> base_extractor_;
--- a/speechx/speechx/frontend/audio/feature_common_inl.h
+++ b/speechx/speechx/frontend/audio/feature_common_inl.h
@ -16,16 +16,15 @@
 namespace ppspeech {

 template <class F>
-StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts, 
-                        std::unique_ptr<FrontendInterface> base_extractor):
-                        opts_(opts),
-                        computer_(opts),
-                        window_function_(opts.frame_opts) {
+StreamingFeatureTpl<F>::StreamingFeatureTpl(
+    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
    base_extractor_ = std::move(base_extractor);
 }

 template <class F>
-void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
+void StreamingFeatureTpl<F>::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
    base_extractor_->Accept(waves);
 }

@ -58,8 +57,9 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {

 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                                     kaldi::Vector<kaldi::BaseFloat>* feats) {
+bool StreamingFeatureTpl<F>::Compute(
+    const kaldi::Vector<kaldi::BaseFloat>& waves,
+    kaldi::Vector<kaldi::BaseFloat>* feats) {
    const kaldi::FrameExtractionOptions& frame_opts =
        computer_.GetFrameOptions();
    int32 num_samples = waves.Dim();
@ -84,9 +84,11 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
                             &window,
                             need_raw_log_energy ? &raw_log_energy : NULL);

-        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
+                                                     kaldi::kUndefined);
        computer_.Compute(&window, &this_feature);
-        kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        kaldi::SubVector<kaldi::BaseFloat> output_row(
+            feats->Data() + frame * Dim(), Dim());
        output_row.CopyFromVec(this_feature);
    }
    return true;
--- a/speechx/speechx/frontend/audio/feature_pipeline.h
+++ b/speechx/speechx/frontend/audio/feature_pipeline.h
@ -16,6 +16,7 @@

 #pragma once

+#include "frontend/audio/assembler.h"
 #include "frontend/audio/audio_cache.h"
 #include "frontend/audio/data_cache.h"
 #include "frontend/audio/fbank.h"
@ -23,7 +24,6 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
-#include "frontend/audio/assembler.h"

 namespace ppspeech {

--- a/speechx/speechx/frontend/audio/linear_spectrogram.cc
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc
@ -28,22 +28,21 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;

-LinearSpectrogramComputer::LinearSpectrogramComputer(
-    const Options& opts)
+LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts)
    : opts_(opts) {
    kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
    int32 window_size = opts.frame_opts.WindowSize();
    frame_length_ = window_size;
    dim_ = window_size / 2 + 1;
-    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
-                                          feature_window_function.window);
+    BaseFloat hanning_window_energy = kaldi::VecVec(
+        feature_window_function.window, feature_window_function.window);
    int32 sample_rate = opts.frame_opts.samp_freq;
    scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }

 // Compute spectrogram feat
 bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
-                                Vector<BaseFloat>* feat) {
+                                        Vector<BaseFloat>* feat) {
    window->Resize(frame_length_, kaldi::kCopyData);
    RealFft(window, true);
    kaldi::ComputePowerSpectrum(window);
--- a/speechx/speechx/nnet/nnet_forward_main.cc
+++ b/speechx/speechx/nnet/nnet_forward_main.cc
@ -14,8 +14,8 @@

 #include "base/flags.h"
 #include "base/log.h"
-#include "frontend/audio/data_cache.h"
 #include "frontend/audio/assembler.h"
+#include "frontend/audio/data_cache.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
    std::shared_ptr<ppspeech::Decodable> decodable(
        new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length 
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
    int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
    int32 receptive_field_length = FLAGS_receptive_field_length;
    LOG(INFO) << "chunk size (frame): " << chunk_size;
@ -130,7 +130,9 @@ int main(int argc, char* argv[]) {
            vector<kaldi::BaseFloat> prob;
            while (decodable->FrameLikelihood(frame_idx, &prob)) {
                kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
-                std::memcpy(vec_tmp.Data(), prob.data(), sizeof(kaldi::BaseFloat)*prob.size());
+                std::memcpy(vec_tmp.Data(),
+                            prob.data(),
+                            sizeof(kaldi::BaseFloat) * prob.size());
                prob_vec.push_back(vec_tmp);
                frame_idx++;
            }
@ -142,7 +144,8 @@ int main(int argc, char* argv[]) {
            KALDI_LOG << " the nnet prob of " << utt << " is empty";
            continue;
        }
-        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),prob_vec[0].Dim());
+        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
+                                               prob_vec[0].Dim());
        for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
            for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
                result(row_idx, col_idx) = prob_vec[row_idx](col_idx);
--- a/speechx/speechx/protocol/websocket/websocket_client.h
+++ b/speechx/speechx/protocol/websocket/websocket_client.h
@ -40,8 +40,8 @@ class WebSocketClient {
    void SendEndSignal();
    void SendDataEnd();
    bool Done() const { return done_; }
-    std::string GetResult() const { return result_; } 
-    std::string GetPartialResult() const { return partial_result_;}
+    std::string GetResult() const { return result_; }
+    std::string GetPartialResult() const { return partial_result_; }

  private:
    void Connect();
--- a/speechx/speechx/protocol/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@ -76,9 +76,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
    recognizer_->Accept(pcm_data);

    std::string partial_result = recognizer_->GetPartialResult();
-    
-    json::value rv = {
-        {"status", "ok"}, {"type", "partial_result"}, {"result", partial_result}};
+
+    json::value rv = {{"status", "ok"},
+                      {"type", "partial_result"},
+                      {"result", partial_result}};
    ws_.text(true);
    ws_.write(asio::buffer(json::serialize(rv)));
 }