PaddleSpeech/deepspeech/exps/deepspeech2/bin/deploy/runtime.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Server-end for the ASR demo."""
import os
import time
import argparse
import functools
import paddle
import numpy as np

from deepspeech.utils.socket_server import warm_up_test
from deepspeech.utils.socket_server import AsrTCPServer
from deepspeech.utils.socket_server import AsrRequestHandler

from deepspeech.training.cli import default_argument_parser
from deepspeech.exps.deepspeech2.config import get_cfg_defaults

from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.utility import add_arguments, print_arguments

from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.io.dataset import ManifestDataset

from paddle.inference import Config
from paddle.inference import create_predictor


def init_predictor(args):
    """Build a Paddle Inference predictor from the command-line arguments.

    Args:
        args: Parsed arguments; ``model_dir``, ``model_file``,
            ``params_file`` and ``use_gpu`` are read here.

    Returns:
        The object returned by ``create_predictor`` for the built config.
    """
    # A model directory, when given, wins over the model/params file pair.
    if args.model_dir is None:
        infer_config = Config(args.model_file, args.params_file)
    else:
        infer_config = Config(args.model_dir)

    infer_config.enable_memory_optim()

    if not args.use_gpu:
        # If not specific mkldnn, you can set the blas thread.
        # The thread num should not be greater than the number of cores in
        # the CPU.
        infer_config.set_cpu_math_library_num_threads(4)
        infer_config.enable_mkldnn()
    else:
        infer_config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)

    return create_predictor(infer_config)


def run(predictor, img):
    """Feed inputs to a predictor, run it once and collect its outputs.

    Args:
        predictor: Object exposing ``get_input_names``, ``get_input_handle``,
            ``run``, ``get_output_names`` and ``get_output_handle``.
        img: Sequence of arrays, one per model input, in the same order as
            ``predictor.get_input_names()``. Each item must provide
            ``.shape`` and ``.copy()``.

    Returns:
        list: One entry per output name, each obtained from the output
        handle's ``copy_to_cpu()``.

    Raises:
        ValueError: If the number of arrays in ``img`` does not match the
            number of model inputs.
    """
    input_names = predictor.get_input_names()
    if len(img) != len(input_names):
        raise ValueError("expected %d input array(s) for inputs %s, got %d" %
                         (len(input_names), list(input_names), len(img)))

    # Copy each array into its input tensor. Previously these two calls were
    # commented out, so inference ran without the inputs ever being written.
    for name, data in zip(input_names, img):
        input_tensor = predictor.get_input_handle(name)
        input_tensor.reshape(data.shape)
        input_tensor.copy_from_cpu(data.copy())

    # do the inference
    predictor.run()

    # Fetch every output tensor back to host memory, in output-name order.
    return [
        predictor.get_output_handle(name).copy_to_cpu()
        for name in predictor.get_output_names()
    ]


def inference(config, args):
    """Create a predictor from ``args``; nothing is returned.

    Args:
        config: Experiment configuration (not used here).
        args: Parsed arguments forwarded to ``init_predictor``.
    """
    # The predictor is only constructed; it is not run or returned.
    engine = init_predictor(args)


def start_server(config, args):
    """Load the model, run a warm-up pass and serve ASR requests over TCP.

    Args:
        config: Experiment configuration; its ``data`` and ``decoding``
            sections are read here.
        args: Parsed arguments; ``checkpoint_path``, ``warmup_manifest``,
            ``host_ip``, ``host_port`` and ``speech_save_dir`` are read here.

    The function finishes by calling ``server.serve_forever()``.
    """
    data_cfg = config.data

    # Keyword options for the dataset; augmentation is given an empty config.
    dataset_options = dict(
        augmentation_config="{}",
        max_duration=data_cfg.max_duration,
        min_duration=data_cfg.min_duration,
        stride_ms=data_cfg.stride_ms,
        window_ms=data_cfg.window_ms,
        n_fft=data_cfg.n_fft,
        max_freq=data_cfg.max_freq,
        target_sample_rate=data_cfg.target_sample_rate,
        specgram_type=data_cfg.specgram_type,
        use_dB_normalization=data_cfg.use_dB_normalization,
        target_dB=data_cfg.target_dB,
        random_seed=data_cfg.random_seed,
        keep_transcription_text=True)
    dataset = ManifestDataset(data_cfg.test_manifest, data_cfg.vocab_filepath,
                              data_cfg.mean_std_filepath, **dataset_options)

    model = DeepSpeech2Model.from_pretrained(dataset, config,
                                             args.checkpoint_path)
    model.eval()

    def file_to_transcript(filename):
        """Decode one audio file and return the first transcript."""
        # Only the first element of the processed utterance is used; the
        # transcript argument is an empty string.
        spectrum = dataset.process_utterance(filename, "")[0]
        # Wrap in a batch of one; original author's note: [1, D, T].
        batch = np.array([spectrum]).astype('float32')
        # Length is taken from the second axis of the feature; shape [1].
        batch_len = np.array([spectrum.shape[1]]).astype('int64')

        decode_cfg = config.decoding
        transcripts = model.decode(
            paddle.to_tensor(batch),
            paddle.to_tensor(batch_len),
            vocab_list=dataset.vocab_list,
            decoding_method=decode_cfg.decoding_method,
            lang_model_path=decode_cfg.lang_model_path,
            beam_alpha=decode_cfg.alpha,
            beam_beta=decode_cfg.beta,
            beam_size=decode_cfg.beam_size,
            cutoff_prob=decode_cfg.cutoff_prob,
            cutoff_top_n=decode_cfg.cutoff_top_n,
            num_processes=decode_cfg.num_proc_bsearch)
        return transcripts[0]

    # Warm up on a few utterances from the warm-up manifest before serving.
    separator = '-----------------------------------------------------------'
    print(separator)
    print('Warming up ...')
    warm_up_test(
        audio_process_handler=file_to_transcript,
        manifest_path=args.warmup_manifest,
        num_test_cases=3)
    print(separator)

    # Bind the TCP server and hand every request to file_to_transcript.
    address = (args.host_ip, args.host_port)
    server = AsrTCPServer(
        server_address=address,
        RequestHandlerClass=AsrRequestHandler,
        speech_save_dir=args.speech_save_dir,
        audio_process_handler=file_to_transcript)
    print("ASR Server Started.")
    server.serve_forever()


def main(config, args):
    """Entry point: delegate to ``start_server`` with the same arguments."""
    start_server(config=config, args=args)


if __name__ == "__main__":
    # Extend the shared training parser with the server/deploy options.
    parser = default_argument_parser()
    add_arg = functools.partial(add_arguments, argparser=parser)
    # yapf: disable
    # Every option is registered the same way: bare name (no leading "--"),
    # then type, default and help, positionally.
    # NOTE(review): the model/GPU options used to be passed as "--name" with
    # keyword arguments, unlike the host/speech options; add_arguments
    # presumably adds the "--" prefix itself -- confirm against its definition.
    add_arg('host_ip',          str,
            'localhost',
            "Server's IP address.")
    add_arg('host_port',        int,    8086,    "Server's IP port.")
    add_arg('speech_save_dir',  str,
            'demo_cache',
            "Directory to save demo audios.")
    add_arg('warmup_manifest',  str, None, "Filepath of manifest to warm up.")
    add_arg('model_file',       str,
            "",
            "Model filename, Specify this when your model is a combined model.")
    add_arg('params_file',      str,
            "",
            "Parameter filename, Specify this when your model is a combined model.")
    add_arg('model_dir',        str,
            None,
            "Model dir, If you load a non-combined model, specify the directory of the model.")
    add_arg('use_gpu',          bool,   False,   "Whether use gpu.")
    # yapf: enable
    args = parser.parse_args()
    print_arguments(args)

    # Start from defaults, then layer the config file and CLI overrides.
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)

    # Fall back to the test manifest only when no warm-up manifest was given;
    # previously a user-supplied value was always overwritten.
    if args.warmup_manifest is None:
        args.warmup_manifest = config.data.test_manifest
    print_arguments(args)

    # Optionally write the effective configuration to a file.
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    main(config, args)
Support paddle 2.x (#538) * 2.x model * model test pass * fix data * fix soundfile with flac support * one thread dataloader test pass * export feasture size add trainer and utils add setup model and dataloader update travis using Bionic dist * add venv; test under venv * fix unittest; train and valid * add train and config * add config and train script * fix ctc cuda memcopy error * fix imports * fix train valid log * fix dataset batch shuffle shift start from 1 fix rank_zero_only decreator error close tensorboard when train over add decoding config and code * test process can run * test with decoding * test and infer with decoding * fix infer * fix ctc loss lr schedule sortagrad logger * aishell egs * refactor train add aishell egs * fix dataset batch shuffle and add batch sampler log print model parameter * fix model and ctc * sequence_mask make all inputs zeros, which cause grad be zero, this is a bug of LessThanOp add grad clip by global norm add model train test notebook * ctc loss remove run prefix using ord value as text id * using unk when training compute_loss need text ids ord id using in test mode, which compute wer/cer * fix tester * add lr_deacy refactor code * fix tools * fix ci add tune fix gru model bugs add dataset and model test * fix decoding * refactor repo fix decoding * fix musan and rir dataset * refactor io, loss, conv, rnn, gradclip, model, utils * fix ci and import * refactor model add export jit model * add deploy bin and test it * rm uselss egs * add layer tools * refactor socket server new model from pretrain * remve useless * fix instability loss and grad nan or inf for librispeech training * fix sampler * fix libri train.sh * fix doc * add license on cpp * fix doc * fix libri script * fix install * clip 5 wer 7.39, clip 400 wer 7.54, 1.8 clip 400 baseline 7.49 4 years ago			`# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.`
			`#`
			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`"""Server-end for the ASR demo."""`
			`import os`
			`import time`
			`import argparse`
			`import functools`
			`import paddle`
			`import numpy as np`

			`from deepspeech.utils.socket_server import warm_up_test`
			`from deepspeech.utils.socket_server import AsrTCPServer`
			`from deepspeech.utils.socket_server import AsrRequestHandler`

			`from deepspeech.training.cli import default_argument_parser`
			`from deepspeech.exps.deepspeech2.config import get_cfg_defaults`

			`from deepspeech.frontend.utility import read_manifest`
			`from deepspeech.utils.utility import add_arguments, print_arguments`

			`from deepspeech.models.deepspeech2 import DeepSpeech2Model`
			`from deepspeech.io.dataset import ManifestDataset`

			`from paddle.inference import Config`
			`from paddle.inference import create_predictor`


			`def init_predictor(args):`
			`if args.model_dir is not None:`
			`config = Config(args.model_dir)`
			`else:`
			`config = Config(args.model_file, args.params_file)`

			`config.enable_memory_optim()`
			`if args.use_gpu:`
			`config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)`
			`else:`
			`# If not specific mkldnn, you can set the blas thread.`
			`# The thread num should not be greater than the number of cores in the CPU.`
			`config.set_cpu_math_library_num_threads(4)`
			`config.enable_mkldnn()`

			`predictor = create_predictor(config)`
			`return predictor`


			`def run(predictor, img):`
			`# copy img data to input tensor`
			`input_names = predictor.get_input_names()`
			`for i, name in enumerate(input_names):`
			`input_tensor = predictor.get_input_handle(name)`
			`#input_tensor.reshape(img[i].shape)`
			`#input_tensor.copy_from_cpu(img[i].copy())`

			`# do the inference`
			`predictor.run()`

			`results = []`
			`# get out data from output tensor`
			`output_names = predictor.get_output_names()`
			`for i, name in enumerate(output_names):`
			`output_tensor = predictor.get_output_handle(name)`
			`output_data = output_tensor.copy_to_cpu()`
			`results.append(output_data)`

			`return results`


			`def inference(config, args):`
			`predictor = init_predictor(args)`


			`def start_server(config, args):`
			`"""Start the ASR server"""`
			`dataset = ManifestDataset(`
			`config.data.test_manifest,`
			`config.data.vocab_filepath,`
			`config.data.mean_std_filepath,`
			`augmentation_config="{}",`
			`max_duration=config.data.max_duration,`
			`min_duration=config.data.min_duration,`
			`stride_ms=config.data.stride_ms,`
			`window_ms=config.data.window_ms,`
			`n_fft=config.data.n_fft,`
			`max_freq=config.data.max_freq,`
			`target_sample_rate=config.data.target_sample_rate,`
			`specgram_type=config.data.specgram_type,`
			`use_dB_normalization=config.data.use_dB_normalization,`
			`target_dB=config.data.target_dB,`
			`random_seed=config.data.random_seed,`
			`keep_transcription_text=True)`

			`model = DeepSpeech2Model.from_pretrained(dataset, config,`
			`args.checkpoint_path)`
			`model.eval()`

			`# prepare ASR inference handler`
			`def file_to_transcript(filename):`
			`feature = dataset.process_utterance(filename, "")`
			`audio = np.array([feature[0]]).astype('float32') #[1, D, T]`
			`audio_len = feature[0].shape[1]`
			`audio_len = np.array([audio_len]).astype('int64') # [1]`

			`result_transcript = model.decode(`
			`paddle.to_tensor(audio),`
			`paddle.to_tensor(audio_len),`
			`vocab_list=dataset.vocab_list,`
			`decoding_method=config.decoding.decoding_method,`
			`lang_model_path=config.decoding.lang_model_path,`
			`beam_alpha=config.decoding.alpha,`
			`beam_beta=config.decoding.beta,`
			`beam_size=config.decoding.beam_size,`
			`cutoff_prob=config.decoding.cutoff_prob,`
			`cutoff_top_n=config.decoding.cutoff_top_n,`
			`num_processes=config.decoding.num_proc_bsearch)`
			`return result_transcript[0]`

			`# warming up with utterrances sampled from Librispeech`
			`print('-----------------------------------------------------------')`
			`print('Warming up ...')`
			`warm_up_test(`
			`audio_process_handler=file_to_transcript,`
			`manifest_path=args.warmup_manifest,`
			`num_test_cases=3)`
			`print('-----------------------------------------------------------')`

			`# start the server`
			`server = AsrTCPServer(`
			`server_address=(args.host_ip, args.host_port),`
			`RequestHandlerClass=AsrRequestHandler,`
			`speech_save_dir=args.speech_save_dir,`
			`audio_process_handler=file_to_transcript)`
			`print("ASR Server Started.")`
			`server.serve_forever()`


			`def main(config, args):`
			`start_server(config, args)`


			`if __name__ == "__main__":`
			`parser = default_argument_parser()`
			`add_arg = functools.partial(add_arguments, argparser=parser)`
			`# yapf: disable`
			`add_arg('host_ip', str,`
			`'localhost',`
			`"Server's IP address.")`
			`add_arg('host_port', int, 8086, "Server's IP port.")`
			`add_arg('speech_save_dir', str,`
			`'demo_cache',`
			`"Directory to save demo audios.")`
			`add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.")`
			`add_arg(`
			`"--model_file",`
			`type=str,`
			`default="",`
			`help="Model filename, Specify this when your model is a combined model."`
			`)`
			`add_arg(`
			`"--params_file",`
			`type=str,`
			`default="",`
			`help=`
			`"Parameter filename, Specify this when your model is a combined model."`
			`)`
			`add_arg(`
			`"--model_dir",`
			`type=str,`
			`default=None,`
			`help=`
			`"Model dir, If you load a non-combined model, specify the directory of the model."`
			`)`
			`add_arg("--use_gpu",`
			`type=bool,`
			`default=False,`
			`help="Whether use gpu.")`
			`args = parser.parse_args()`
			`print_arguments(args)`

			`# https://yaml.org/type/float.html`
			`config = get_cfg_defaults()`
			`if args.config:`
			`config.merge_from_file(args.config)`
			`if args.opts:`
			`config.merge_from_list(args.opts)`
			`config.freeze()`
			`print(config)`

			`args.warmup_manifest = config.data.test_manifest`
			`print_arguments(args)`

			`if args.dump_config:`
			`with open(args.dump_config, 'w') as f:`
			`print(config, file=f)`

			`main(config, args)`