fix ; and format

pull/2015/head
Hui Zhang 3 years ago
parent 69a6da4c16
commit dfdf450b22

@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import argparse
 import json
 import os

@@ -14,7 +14,3 @@
 import _locale
 _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

@@ -145,4 +145,3 @@ for com, info in _commands.items():
         name='paddlespeech.{}'.format(com),
         description=info[0],
         cls='paddlespeech.cli.{}.{}'.format(com, info[1]))
-

@@ -21,12 +21,12 @@ from typing import Union
 import numpy as np
 import paddle
 import yaml
+from paddleaudio import load
+from paddleaudio.features import LogMelSpectrogram
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio import load
-from paddleaudio.features import LogMelSpectrogram

 __all__ = ['CLSExecutor']

@@ -22,13 +22,13 @@ from typing import Union
 import paddle
 import soundfile
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from yacs.config import CfgNode
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import stats_wrapper
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

@@ -22,8 +22,7 @@ model_alias = {
     # -------------- ASR --------------
     # ---------------------------------
     "deepspeech2offline": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
-    "deepspeech2online":
-    ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
+    "deepspeech2online": ["paddlespeech.s2t.models.ds2:DeepSpeech2Model"],
     "conformer": ["paddlespeech.s2t.models.u2:U2Model"],
     "conformer_online": ["paddlespeech.s2t.models.u2:U2Model"],
     "transformer": ["paddlespeech.s2t.models.u2:U2Model"],

@@ -76,7 +76,8 @@ class CTCPrefixScorePD():
         last_ids = [yi[-1] for yi in y]  # last output label ids
         n_bh = len(last_ids)  # batch * hyps
         n_hyps = n_bh // self.batch  # assuming each utterance has the same # of hyps
-        self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0
+        self.scoring_num = paddle.shape(scoring_ids)[
+            -1] if scoring_ids is not None else 0
         # prepare state info
         if state is None:
             r_prev = paddle.full(

@@ -22,11 +22,9 @@ import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle import inference
-from paddle.io import DataLoader
-from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.io.dataset import ManifestDataset
+from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
@@ -238,8 +236,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self._text_featurizer = TextFeaturizer(
-            unit_type=config.unit_type,
-            vocab=config.vocab_filepath)
+            unit_type=config.unit_type, vocab=config.vocab_filepath)
         self.vocab_list = self._text_featurizer.vocab_list

     def ordid2token(self, texts, texts_len):
@@ -248,7 +245,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         for text, n in zip(texts, texts_len):
             n = n.numpy().item()
             ids = text[:n]
-            trans.append(self._text_featurizer.defeaturize(ids.numpy().tolist()))
+            trans.append(
+                self._text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans

     def compute_metrics(self,

@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
+
 from .deepspeech2 import DeepSpeech2InferModel
 from .deepspeech2 import DeepSpeech2Model
 from paddlespeech.s2t.utils import dynamic_pip_install
-import sys

 try:
     import paddlespeech_ctcdecoders

@@ -372,11 +372,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

-    def forward(self, audio_chunk, audio_chunk_lens, chunk_state_h_box=None,
+    def forward(self,
+                audio_chunk,
+                audio_chunk_lens,
+                chunk_state_h_box=None,
                 chunk_state_c_box=None):
         if self.encoder.rnn_direction == "forward":
             eouts_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box = self.encoder(
-                audio_chunk, audio_chunk_lens, chunk_state_h_box, chunk_state_c_box)
+                audio_chunk, audio_chunk_lens, chunk_state_h_box,
+                chunk_state_c_box)
             probs_chunk = self.decoder.softmax(eouts_chunk)
             return probs_chunk, eouts_chunk_lens, final_state_h_box, final_state_c_box
         elif self.encoder.rnn_direction == "bidirect":
@@ -392,8 +396,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
             self,
             input_spec=[
                 paddle.static.InputSpec(
-                    shape=[None, None,
-                           self.encoder.feat_size],  #[B, chunk_size, feat_dim]
+                    shape=[None, None, self.encoder.feat_size
+                           ],  #[B, chunk_size, feat_dim]
                     dtype='float32'),
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]

@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(paddle.shape(ys_mask)[-1])).unsqueeze(0)
+        m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m

     def forward(self, x: paddle.Tensor, t: paddle.Tensor

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from contextlib import nullcontext
-

 import paddle

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import sys
 from typing import Union

 import paddle
@@ -22,7 +23,6 @@ from paddlespeech.s2t.modules.align import Linear
 from paddlespeech.s2t.modules.loss import CTCLoss
 from paddlespeech.s2t.utils import ctc_utils
 from paddlespeech.s2t.utils.log import Log
-import sys

 logger = Log(__name__).getlog()

@@ -82,7 +82,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
     max_size = paddle.shape(sequences[0])
     # (TODO Hui Zhang): slice not supprot `end==start`
     # trailing_dims = max_size[1:]
-    trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
+    trailing_dims = tuple(
+        max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else ()
     max_len = max([s.shape[0] for s in sequences])
     if batch_first:
         out_dims = (len(sequences), max_len) + trailing_dims
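
For reference, the wrapped `trailing_dims` expression above only changes line breaks: the logic keeps every dimension after axis 0, so padding preserves the per-frame feature shape. A minimal NumPy sketch of that behaviour (the `pad_list` helper is hypothetical, not part of this commit):

```python
import numpy as np

def pad_list(seqs, pad_value=0.0):
    # Keep all dims after axis 0; pad each sequence to the longest length.
    trailing_dims = tuple(seqs[0].shape[1:]) if seqs[0].ndim >= 2 else ()
    max_len = max(s.shape[0] for s in seqs)
    out = np.full((len(seqs), max_len) + trailing_dims,
                  pad_value,
                  dtype=seqs[0].dtype)
    for i, s in enumerate(seqs):
        out[i, :s.shape[0]] = s
    return out

a = np.ones((5, 80), dtype=np.float32)  # 5 frames, 80-dim features
b = np.ones((3, 80), dtype=np.float32)  # 3 frames
print(pad_list([a, b]).shape)  # (2, 5, 80), batch_first layout
```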

@@ -55,7 +55,7 @@ class PaddleASRConnectionHanddler:
         self.config = asr_engine.config  # server config
         self.model_config = asr_engine.executor.config
         self.asr_engine = asr_engine
-
+        # model_type, sample_rate and text_feature is shared for deepspeech2 and conformer
         self.model_type = self.asr_engine.executor.model_type
         self.sample_rate = self.asr_engine.executor.sample_rate
@@ -191,7 +191,7 @@ class PaddleASRConnectionHanddler:
         self.num_frames = 0

         ## endpoint
         self.endpoint_state = False  # True for detect endpoint

         ## conformer
         self.model_reset()
@@ -503,11 +503,13 @@ class PaddleASRConnectionHanddler:
         # endpoint
         if not is_finished:

             def contain_nonsilence():
                 return len(self.hyps) > 0 and len(self.hyps[0]) > 0

             decoding_something = contain_nonsilence()
-            if self.endpointer.endpoint_detected(ctc_probs.numpy(), decoding_something):
+            if self.endpointer.endpoint_detected(ctc_probs.numpy(),
+                                                 decoding_something):
                 self.endpoint_state = True
                 logger.info(f"Endpoint is detected at {self.num_frames} frame.")
@@ -869,7 +871,6 @@ class ASREngine(BaseEngine):
         logger.info("Initialize ASR server engine successfully.")
         return True

-
     def new_handler(self):
         """New handler from model.

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from dataclasses import dataclass
+from typing import List

 import numpy as np
-
 from paddlespeech.cli.log import logger
@@ -76,12 +76,11 @@ class OnlineCTCEndpoint:
             decoding_something or (not rule.must_contain_nonsilence)
         ) and trailine_silence >= rule.min_trailing_silence and utterance_length >= rule.min_utterance_length
         if (ans):
-            logger.info(
-                f"Endpoint Rule: {rule_name} activated: {rule}"
-            )
+            logger.info(f"Endpoint Rule: {rule_name} activated: {rule}")
         return ans

-    def endpoint_detected(self, ctc_log_probs: np.ndarray,
+    def endpoint_detected(self,
+                          ctc_log_probs: np.ndarray,
                           decoding_something: bool) -> bool:
         """detect endpoint.

@@ -42,7 +42,6 @@ class TTSServerExecutor(TTSExecutor):
         self.task_resource = CommonTaskResource(
             task='tts', model_format='dynamic', inference_mode='online')

-
     def get_model_info(self,
                        field: str,
                        model_name: str,

@@ -19,7 +19,6 @@ from fastapi import WebSocketDisconnect
 from starlette.websockets import WebSocketState as WebSocketState

 from paddlespeech.cli.log import logger
-from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.engine.engine_pool import get_engine_pool

 router = APIRouter()
@@ -106,7 +105,7 @@ async def websocket_endpoint(websocket: WebSocket):
                     logger.info("endpoint: detected and rescoring.")
                     connection_handler.rescoring()
-
+                    word_time_stamp = connection_handler.get_word_time_stamp()
                     asr_results = connection_handler.get_result()
                     if connection_handler.endpoint_state:
@@ -124,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket):
                     }
                     await websocket.send_json(resp)
                     break
-
+                # return the current partial result
+                # if the engine create the vad instance, this connection will have many partial results
                 resp = {'result': asr_results}

@@ -140,10 +140,7 @@ def parse_args():
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -179,10 +176,7 @@ def parse_args():
         ],
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -174,10 +174,7 @@ def parse_args():
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -220,10 +217,7 @@ def parse_args():
         ],
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -131,10 +131,7 @@ def parse_args():
         choices=['fastspeech2_aishell3', 'tacotron2_aishell3'],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
-        '--am_config',
-        type=str,
-        default=None,
-        help='Config of acoustic model.')
+        '--am_config', type=str, default=None, help='Config of acoustic model.')
     parser.add_argument(
         '--am_ckpt',
         type=str,
@@ -160,10 +157,7 @@ def parse_args():
         help='Choose vocoder type of tts task.')
     parser.add_argument(
-        '--voc_config',
-        type=str,
-        default=None,
-        help='Config of voc.')
+        '--voc_config', type=str, default=None, help='Config of voc.')
     parser.add_argument(
         '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
     parser.add_argument(

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from .vits import *
 from .vits_updater import *

@@ -56,7 +56,8 @@ class VITSUpdater(StandardUpdater):
         self.models: Dict[str, Layer] = models
         # self.model = model
-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

         self.optimizers = optimizers
         self.optimizer_g: Optimizer = optimizers['generator']
@@ -225,7 +226,8 @@ class VITSEvaluator(StandardEvaluator):
         models = {"main": model}
         self.models: Dict[str, Layer] = models
         # self.model = model
-        self.model = model._layers if isinstance(model, paddle.DataParallel) else model
+        self.model = model._layers if isinstance(model,
+                                                 paddle.DataParallel) else model

         self.criterions = criterions
         self.criterion_mel = criterions['mel']

@@ -971,18 +971,18 @@ class FeatureMatchLoss(nn.Layer):
         return feat_match_loss


 # loss for VITS
 class KLDivergenceLoss(nn.Layer):
     """KL divergence loss."""

     def forward(
             self,
             z_p: paddle.Tensor,
             logs_q: paddle.Tensor,
             m_p: paddle.Tensor,
             logs_p: paddle.Tensor,
-            z_mask: paddle.Tensor,
-    ) -> paddle.Tensor:
+            z_mask: paddle.Tensor, ) -> paddle.Tensor:
         """Calculate KL divergence loss.

         Args:
@@ -1002,8 +1002,8 @@ class KLDivergenceLoss(nn.Layer):
         logs_p = paddle.cast(logs_p, 'float32')
         z_mask = paddle.cast(z_mask, 'float32')
         kl = logs_p - logs_q - 0.5
-        kl += 0.5 * ((z_p - m_p) ** 2) * paddle.exp(-2.0 * logs_p)
+        kl += 0.5 * ((z_p - m_p)**2) * paddle.exp(-2.0 * logs_p)
         kl = paddle.sum(kl * z_mask)
         loss = kl / paddle.sum(z_mask)

         return loss
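
For reference, the expression being reformatted here is the sampled KL between the approximate posterior q = N(m_q, e^logs_q) and the prior p = N(m_p, e^logs_p): per element it is logs_p - logs_q - 0.5 + 0.5 * (z_p - m_p)^2 * exp(-2 * logs_p), where the posterior quadratic term has been replaced by its expectation 0.5. A small NumPy sanity check of that arithmetic (shapes are illustrative):

```python
import numpy as np

def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    # Same arithmetic as the hunk above, with a masked mean at the end.
    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p)**2) * np.exp(-2.0 * logs_p)
    return (kl * z_mask).sum() / z_mask.sum()

rng = np.random.default_rng(0)
m = rng.normal(size=(2, 4, 8)).astype(np.float32)
logs = np.zeros((2, 4, 8), dtype=np.float32)   # unit variance
mask = np.ones((2, 4, 8), dtype=np.float32)
z = m + rng.normal(size=(2, 4, 8)).astype(np.float32)  # z ~ N(m, 1)
print(kl_loss(z, logs, m, logs, mask))  # ~0 on average when q == p
```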

@@ -25,4 +25,3 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host
 > Reminder: Only for developer, make sure you know what's it.

-
 * codelab - for speechx developer, using for test.

@@ -3,4 +3,4 @@
 ## Examples

 * `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
 * `aishell` - Streaming Decoding under aishell dataset, for local WER test.

@@ -4,4 +4,3 @@
 > Reminder: Only for developer.

-
 * codelab - for speechx developer, using for test.

@@ -91,8 +91,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
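
The reformatted arithmetic above determines how many feature frames feed one decoder chunk: a full receptive field for the first subsampled output, plus one stride per additional output. A small Python mirror of the C++ logic, with illustrative flag values (7/4/8 are examples, not values taken from this diff):

```python
def chunk_geometry(receptive_field_length, downsampling_rate, nnet_decoder_chunk):
    # Frames required to produce nnet_decoder_chunk subsampled outputs.
    chunk_size = (receptive_field_length +
                  (nnet_decoder_chunk - 1) * downsampling_rate)
    # Frames the window advances between successive chunks.
    chunk_stride = downsampling_rate * nnet_decoder_chunk
    return chunk_size, chunk_stride

print(chunk_geometry(7, 4, 8))  # (35, 32): 35-frame window, advancing 32 frames
```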

@@ -64,7 +64,7 @@ std::string TLGDecoder::GetPartialResult() {
         std::string word = word_symbol_table_->Find(words_id[idx]);
         words += word;
     }
     return words;
 }

 std::string TLGDecoder::GetFinalBestPath() {

@@ -82,7 +82,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
     opts.assembler_opts.subsampling_rate = FLAGS_downsampling_rate;
     opts.assembler_opts.receptive_filed_length = FLAGS_receptive_field_length;
     opts.assembler_opts.nnet_decoder_chunk = FLAGS_nnet_decoder_chunk;

     return opts;
 }

@@ -93,8 +93,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;

@@ -24,7 +24,8 @@ using std::unique_ptr;
 Assembler::Assembler(AssemblerOptions opts,
                      unique_ptr<FrontendInterface> base_extractor) {
     frame_chunk_stride_ = opts.subsampling_rate * opts.nnet_decoder_chunk;
-    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate + opts.receptive_filed_length;
+    frame_chunk_size_ = (opts.nnet_decoder_chunk - 1) * opts.subsampling_rate +
+                        opts.receptive_filed_length;
     receptive_filed_length_ = opts.receptive_filed_length;
     base_extractor_ = std::move(base_extractor);
     dim_ = base_extractor_->Dim();
@@ -50,8 +51,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
         Vector<BaseFloat> feature;
         result = base_extractor_->Read(&feature);
         if (result == false || feature.Dim() == 0) {
             if (IsFinished() == false) return false;
             break;
         }
         feature_cache_.push(feature);
     }
@@ -61,22 +62,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
     }

     while (feature_cache_.size() < frame_chunk_size_) {
         Vector<BaseFloat> feature(dim_, kaldi::kSetZero);
         feature_cache_.push(feature);
     }

     int32 counter = 0;
     int32 cache_size = frame_chunk_size_ - frame_chunk_stride_;
     int32 elem_dim = base_extractor_->Dim();

     while (counter < frame_chunk_size_) {
         Vector<BaseFloat>& val = feature_cache_.front();
         int32 start = counter * elem_dim;
         feats->Range(start, elem_dim).CopyFromVec(val);
-        if (frame_chunk_size_ - counter <= cache_size ) {
+        if (frame_chunk_size_ - counter <= cache_size) {
             feature_cache_.push(val);
         }
         feature_cache_.pop();
         counter++;
     }

     return result;

@@ -25,7 +25,7 @@ struct AssemblerOptions {
     int32 receptive_filed_length;
     int32 subsampling_rate;
     int32 nnet_decoder_chunk;

     AssemblerOptions()
         : receptive_filed_length(1),
           subsampling_rate(1),
@@ -47,15 +47,11 @@ class Assembler : public FrontendInterface {
     // feat dim
     virtual size_t Dim() const { return dim_; }

-    virtual void SetFinished() {
-        base_extractor_->SetFinished();
-    }
+    virtual void SetFinished() { base_extractor_->SetFinished(); }

     virtual bool IsFinished() const { return base_extractor_->IsFinished(); }

-    virtual void Reset() {
-        base_extractor_->Reset();
-    }
+    virtual void Reset() { base_extractor_->Reset(); }

   private:
     bool Compute(kaldi::Vector<kaldi::BaseFloat>* feats);

@@ -30,7 +30,7 @@ class AudioCache : public FrontendInterface {
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);

     // the audio dim is 1, one sample, which is useless,
     // so we return size_(cache samples) instead.
     virtual size_t Dim() const { return size_; }

@@ -29,19 +29,19 @@ using kaldi::Matrix;
 using std::vector;

 FbankComputer::FbankComputer(const Options& opts)
-    : opts_(opts),
-      computer_(opts) {}
+    : opts_(opts), computer_(opts) {}

 int32 FbankComputer::Dim() const {
     return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0);
 }

 bool FbankComputer::NeedRawLogEnergy() {
     return opts_.use_energy && opts_.raw_energy;
 }

 // Compute feat
-bool FbankComputer::Compute(Vector<BaseFloat>* window, Vector<BaseFloat>* feat) {
+bool FbankComputer::Compute(Vector<BaseFloat>* window,
+                            Vector<BaseFloat>* feat) {
     RealFft(window, true);
     kaldi::ComputePowerSpectrum(window);
     const kaldi::MelBanks& mel_bank = *(computer_.GetMelBanks(1.0));

@@ -72,9 +72,9 @@ bool FeatureCache::Compute() {
     bool result = base_extractor_->Read(&feature);
     if (result == false || feature.Dim() == 0) return false;

-    int32 num_chunk = feature.Dim() / dim_ ;
+    int32 num_chunk = feature.Dim() / dim_;
     for (int chunk_idx = 0; chunk_idx < num_chunk; ++chunk_idx) {
         int32 start = chunk_idx * dim_;
         Vector<BaseFloat> feature_chunk(dim_);
         SubVector<BaseFloat> tmp(feature.Data() + start, dim_);
         feature_chunk.CopyFromVec(tmp);

@@ -22,9 +22,7 @@ namespace ppspeech {

 struct FeatureCacheOptions {
     int32 max_size;
     int32 timeout;  // ms
-    FeatureCacheOptions()
-        : max_size(kint16max),
-          timeout(1) {}
+    FeatureCacheOptions() : max_size(kint16max), timeout(1) {}
 };

 class FeatureCache : public FrontendInterface {

@@ -23,11 +23,11 @@ template <class F>
 class StreamingFeatureTpl : public FrontendInterface {
   public:
     typedef typename F::Options Options;

     StreamingFeatureTpl(const Options& opts,
                         std::unique_ptr<FrontendInterface> base_extractor);
     virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
     virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);

     // the dim_ is the dim of single frame feature
     virtual size_t Dim() const { return computer_.Dim(); }
@@ -39,8 +39,9 @@ class StreamingFeatureTpl : public FrontendInterface {
         base_extractor_->Reset();
         remained_wav_.Resize(0);
     }
+
   private:
     bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
                  kaldi::Vector<kaldi::BaseFloat>* feats);
     Options opts_;
     std::unique_ptr<FrontendInterface> base_extractor_;

@@ -16,16 +16,15 @@

 namespace ppspeech {

 template <class F>
-StreamingFeatureTpl<F>::StreamingFeatureTpl(const Options& opts,
-    std::unique_ptr<FrontendInterface> base_extractor):
-    opts_(opts),
-    computer_(opts),
-    window_function_(opts.frame_opts) {
+StreamingFeatureTpl<F>::StreamingFeatureTpl(
+    const Options& opts, std::unique_ptr<FrontendInterface> base_extractor)
+    : opts_(opts), computer_(opts), window_function_(opts.frame_opts) {
     base_extractor_ = std::move(base_extractor);
 }

 template <class F>
-void StreamingFeatureTpl<F>::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
+void StreamingFeatureTpl<F>::Accept(
+    const kaldi::VectorBase<kaldi::BaseFloat>& waves) {
     base_extractor_->Accept(waves);
 }
@@ -58,8 +57,9 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {

 // Compute feat
 template <class F>
-bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
-                                     kaldi::Vector<kaldi::BaseFloat>* feats) {
+bool StreamingFeatureTpl<F>::Compute(
+    const kaldi::Vector<kaldi::BaseFloat>& waves,
+    kaldi::Vector<kaldi::BaseFloat>* feats) {
     const kaldi::FrameExtractionOptions& frame_opts =
         computer_.GetFrameOptions();
     int32 num_samples = waves.Dim();
@@ -84,9 +84,11 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
             &window,
             need_raw_log_energy ? &raw_log_energy : NULL);

-        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(), kaldi::kUndefined);
+        kaldi::Vector<kaldi::BaseFloat> this_feature(computer_.Dim(),
+                                                     kaldi::kUndefined);
         computer_.Compute(&window, &this_feature);
-        kaldi::SubVector<kaldi::BaseFloat> output_row(feats->Data() + frame * Dim(), Dim());
+        kaldi::SubVector<kaldi::BaseFloat> output_row(
+            feats->Data() + frame * Dim(), Dim());
         output_row.CopyFromVec(this_feature);
     }
     return true;

@@ -16,6 +16,7 @@

 #pragma once

+#include "frontend/audio/assembler.h"
 #include "frontend/audio/audio_cache.h"
 #include "frontend/audio/data_cache.h"
 #include "frontend/audio/fbank.h"
@@ -23,7 +24,6 @@
 #include "frontend/audio/frontend_itf.h"
 #include "frontend/audio/linear_spectrogram.h"
 #include "frontend/audio/normalizer.h"
-#include "frontend/audio/assembler.h"

 namespace ppspeech {

@@ -28,22 +28,21 @@ using kaldi::VectorBase;
 using kaldi::Matrix;
 using std::vector;

-LinearSpectrogramComputer::LinearSpectrogramComputer(
-    const Options& opts)
+LinearSpectrogramComputer::LinearSpectrogramComputer(const Options& opts)
     : opts_(opts) {
     kaldi::FeatureWindowFunction feature_window_function(opts.frame_opts);
     int32 window_size = opts.frame_opts.WindowSize();
     frame_length_ = window_size;
     dim_ = window_size / 2 + 1;
-    BaseFloat hanning_window_energy = kaldi::VecVec(feature_window_function.window,
-        feature_window_function.window);
+    BaseFloat hanning_window_energy = kaldi::VecVec(
+        feature_window_function.window, feature_window_function.window);
     int32 sample_rate = opts.frame_opts.samp_freq;
     scale_ = 2.0 / (hanning_window_energy * sample_rate);
 }

 // Compute spectrogram feat
 bool LinearSpectrogramComputer::Compute(Vector<BaseFloat>* window,
                                         Vector<BaseFloat>* feat) {
     window->Resize(frame_length_, kaldi::kCopyData);
     RealFft(window, true);
     kaldi::ComputePowerSpectrum(window);

@@ -14,8 +14,8 @@

 #include "base/flags.h"
 #include "base/log.h"
-#include "frontend/audio/data_cache.h"
 #include "frontend/audio/assembler.h"
+#include "frontend/audio/data_cache.h"
 #include "kaldi/util/table-types.h"
 #include "nnet/decodable.h"
 #include "nnet/paddle_nnet.h"
@@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
     std::shared_ptr<ppspeech::Decodable> decodable(
         new ppspeech::Decodable(nnet, raw_data, FLAGS_acoustic_scale));

-    int32 chunk_size = FLAGS_receptive_field_length
-        + (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
+    int32 chunk_size = FLAGS_receptive_field_length +
+                       (FLAGS_nnet_decoder_chunk - 1) * FLAGS_downsampling_rate;
     int32 chunk_stride = FLAGS_downsampling_rate * FLAGS_nnet_decoder_chunk;
     int32 receptive_field_length = FLAGS_receptive_field_length;
     LOG(INFO) << "chunk size (frame): " << chunk_size;
@@ -130,7 +130,9 @@ int main(int argc, char* argv[]) {
         vector<kaldi::BaseFloat> prob;
         while (decodable->FrameLikelihood(frame_idx, &prob)) {
             kaldi::Vector<kaldi::BaseFloat> vec_tmp(prob.size());
-            std::memcpy(vec_tmp.Data(), prob.data(), sizeof(kaldi::BaseFloat)*prob.size());
+            std::memcpy(vec_tmp.Data(),
+                        prob.data(),
+                        sizeof(kaldi::BaseFloat) * prob.size());
             prob_vec.push_back(vec_tmp);
             frame_idx++;
         }
@@ -142,7 +144,8 @@ int main(int argc, char* argv[]) {
             KALDI_LOG << " the nnet prob of " << utt << " is empty";
             continue;
         }
-        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),prob_vec[0].Dim());
+        kaldi::Matrix<kaldi::BaseFloat> result(prob_vec.size(),
+                                               prob_vec[0].Dim());
         for (int32 row_idx = 0; row_idx < prob_vec.size(); ++row_idx) {
             for (int32 col_idx = 0; col_idx < prob_vec[0].Dim(); ++col_idx) {
                 result(row_idx, col_idx) = prob_vec[row_idx](col_idx);

@@ -40,8 +40,8 @@ class WebSocketClient {
     void SendEndSignal();
     void SendDataEnd();
     bool Done() const { return done_; }
     std::string GetResult() const { return result_; }
-    std::string GetPartialResult() const { return partial_result_;}
+    std::string GetPartialResult() const { return partial_result_; }

   private:
     void Connect();

@@ -76,9 +76,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
     recognizer_->Accept(pcm_data);
     std::string partial_result = recognizer_->GetPartialResult();

-    json::value rv = {
-        {"status", "ok"}, {"type", "partial_result"}, {"result", partial_result}};
+    json::value rv = {{"status", "ok"},
+                      {"type", "partial_result"},
+                      {"result", partial_result}};
     ws_.text(true);
     ws_.write(asio::buffer(json::serialize(rv)));
 }
