{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/ssd5/zhanghui/DeepSpeech2.x\n" ] }, { "data": { "text/plain": [ "'/home/ssd5/zhanghui/DeepSpeech2.x'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%cd ..\n", "%pwd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-03-26 02:55:23,873 - WARNING - register user softmax to paddle, remove this when fixed!\n", "2021-03-26 02:55:23,875 - WARNING - register user sigmoid to paddle, remove this when fixed!\n", "2021-03-26 02:55:23,875 - WARNING - register user relu to paddle, remove this when fixed!\n", "2021-03-26 02:55:23,876 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n", "2021-03-26 02:55:23,876 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n", "2021-03-26 02:55:23,877 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", "2021-03-26 02:55:23,877 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", "2021-03-26 02:55:23,878 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,878 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,879 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,880 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,880 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,881 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,881 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,882 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,882 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n", "2021-03-26 02:55:23,883 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n", "2021-03-26 02:55:23,883 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", "2021-03-26 02:55:23,884 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n", "2021-03-26 02:55:23,884 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n", "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", " from numpy.dual import register_func\n", "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n" ] } ], "source": [ "import os\n", "import time\n", "import argparse\n", "import functools\n", "import paddle\n", "import numpy as np\n", "\n", "from deepspeech.utils.socket_server import warm_up_test\n", "from deepspeech.utils.socket_server import AsrTCPServer\n", "from deepspeech.utils.socket_server import AsrRequestHandler\n", "\n", "from deepspeech.training.cli import default_argument_parser\n", "from deepspeech.exps.deepspeech2.config import get_cfg_defaults\n", "\n", "from deepspeech.frontend.utility import read_manifest\n", "from deepspeech.utils.utility import add_arguments, print_arguments\n", "\n", "from deepspeech.models.deepspeech2 import DeepSpeech2Model\n", "from deepspeech.models.deepspeech2 import DeepSpeech2InferModel\n", "from deepspeech.io.dataset import ManifestDataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0.0\n", "e7f28d6c0db54eb9c9a810612300b526687e56a6\n", "OFF\n", "OFF\n", "commit: e7f28d6c0db54eb9c9a810612300b526687e56a6\n", "None\n", "0\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/ssd5/zhanghui/DeepSpeech2.x/tools/venv-dev/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n" ] }, { "data": { "text/plain": [ "['__builtins__',\n", " '__cached__',\n", " '__doc__',\n", " '__file__',\n", " '__loader__',\n", " '__name__',\n", " '__package__',\n", " '__spec__',\n", " 'commit',\n", " 'full_version',\n", " 'istaged',\n", " 'major',\n", " 'minor',\n", " 'mkl',\n", " 'patch',\n", " 'rc',\n", " 'show',\n", " 'with_mkl']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(paddle.__version__)\n", "print(paddle.version.commit)\n", "print(paddle.version.with_mkl)\n", "print(paddle.version.mkl())\n", "print(paddle.version.show())\n", "print(paddle.version.patch)\n", "dir(paddle.version)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "data:\n", " augmentation_config: conf/augmentation.config\n", " batch_size: 64\n", " dev_manifest: data/manifest.dev\n", " keep_transcription_text: False\n", " max_duration: 27.0\n", " max_freq: None\n", " mean_std_filepath: examples/aishell/data/mean_std.npz\n", " min_duration: 0.0\n", " n_fft: None\n", " num_workers: 0\n", " random_seed: 0\n", " shuffle_method: batch_shuffle\n", " sortagrad: True\n", " specgram_type: linear\n", " stride_ms: 10.0\n", " target_dB: -20\n", " target_sample_rate: 16000\n", " test_manifest: examples/aishell/data/manifest.test\n", " train_manifest: data/manifest.train\n", " use_dB_normalization: True\n", " vocab_filepath: examples/aishell/data/vocab.txt\n", " window_ms: 20.0\n", "decoding:\n", " alpha: 2.6\n", " batch_size: 128\n", " beam_size: 300\n", " beta: 5.0\n", " cutoff_prob: 0.99\n", " 
cutoff_top_n: 40\n", " decoding_method: ctc_beam_search\n", " error_rate_type: cer\n", " lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm\n", " num_proc_bsearch: 10\n", "model:\n", " num_conv_layers: 2\n", " num_rnn_layers: 3\n", " rnn_layer_size: 1024\n", " share_rnn_weights: False\n", " use_gru: True\n", "training:\n", " global_grad_clip: 5.0\n", " lr: 0.0005\n", " lr_decay: 0.83\n", " n_epoch: 30\n", " weight_decay: 1e-06\n", "----------- Configuration Arguments -----------\n", "checkpoint_path: examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725\n", "config: examples/aishell/conf/deepspeech2.yaml\n", "device: gpu\n", "dump_config: None\n", "export_path: None\n", "host_ip: localhost\n", "host_port: 8086\n", "model_dir: None\n", "model_file: examples/aishell/jit.model.pdmodel\n", "nprocs: 1\n", "opts: ['data.test_manifest', 'examples/aishell/data/manifest.test', 'data.mean_std_filepath', 'examples/aishell/data/mean_std.npz', 'data.vocab_filepath', 'examples/aishell/data/vocab.txt']\n", "output: None\n", "params_file: examples/aishell/jit.model.pdiparams\n", "speech_save_dir: demo_cache\n", "use_gpu: False\n", "warmup_manifest: examples/aishell/data/manifest.test\n", "------------------------------------------------\n" ] } ], "source": [ "parser = default_argument_parser()\n", "add_arg = functools.partial(add_arguments, argparser=parser)\n", "add_arg('host_ip', str,\n", " 'localhost',\n", " \"Server's IP address.\")\n", "add_arg('host_port', int, 8086, \"Server's IP port.\")\n", "add_arg('speech_save_dir', str,\n", " 'demo_cache',\n", " \"Directory to save demo audios.\")\n", "add_arg('warmup_manifest', \n", " str, \n", " \"examples/aishell/data/manifest.test\", \n", " \"Filepath of manifest to warm up.\")\n", "add_arg(\n", " \"--model_file\",\n", " type=str,\n", " default=\"examples/aishell/jit.model.pdmodel\",\n", " help=\"Model filename; specify this when your model is a combined model.\"\n", ")\n", "add_arg(\n", " \"--params_file\",\n", " type=str,\n", " default=\"examples/aishell/jit.model.pdiparams\",\n", " help=\n", " \"Parameter filename; specify this when your model is a combined model.\"\n", ")\n", "add_arg(\n", " \"--model_dir\",\n", " type=str,\n", " default=None,\n", " help=\n", " \"Model directory; if you load a non-combined model, specify the directory of the model.\"\n", ")\n", "add_arg(\"--use_gpu\", type=bool, default=False, help=\"Whether to use GPU.\")\n", "\n", "\n", "args = parser.parse_args(\n", " \"--checkpoint_path examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725 --config examples/aishell/conf/deepspeech2.yaml --opts data.test_manifest examples/aishell/data/manifest.test data.mean_std_filepath examples/aishell/data/mean_std.npz data.vocab_filepath examples/aishell/data/vocab.txt\".split()\n", ")\n", "\n", "\n", "config = get_cfg_defaults()\n", "if args.config:\n", " config.merge_from_file(args.config)\n", "if args.opts:\n", " config.merge_from_list(args.opts)\n", "config.freeze()\n", "print(config)\n", "\n", "args.warmup_manifest = config.data.test_manifest\n", "\n", "print_arguments(args)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "dataset = ManifestDataset(\n", " config.data.test_manifest,\n", " config.data.unit_type,\n", " config.data.vocab_filepath,\n", " config.data.mean_std_filepath,\n", " augmentation_config=\"{}\",\n", " max_duration=config.data.max_duration,\n", " min_duration=config.data.min_duration,\n", " stride_ms=config.data.stride_ms,\n", " window_ms=config.data.window_ms,\n", " 
n_fft=config.data.n_fft,\n", " max_freq=config.data.max_freq,\n", " target_sample_rate=config.data.target_sample_rate,\n", " specgram_type=config.data.specgram_type,\n", " feat_dim=config.data.feat_dim,\n", " delta_delta=config.data.delat_delta,\n", " use_dB_normalization=config.data.use_dB_normalization,\n", " target_dB=config.data.target_dB,\n", " random_seed=config.data.random_seed,\n", " keep_transcription_text=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-03-26 02:55:57,930 - INFO - [checkpoint] Rank 0: loaded model from examples/aishell/ckpt-loss2e-3-0.83-5/checkpoints/step-11725.pdparams\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "layer summary:\n", "encoder.conv.conv_in.conv.weight|[32, 1, 41, 11]|14432\n", "encoder.conv.conv_in.bn.weight|[32]|32\n", "encoder.conv.conv_in.bn.bias|[32]|32\n", "encoder.conv.conv_in.bn._mean|[32]|32\n", "encoder.conv.conv_in.bn._variance|[32]|32\n", "encoder.conv.conv_stack.0.conv.weight|[32, 32, 21, 11]|236544\n", "encoder.conv.conv_stack.0.bn.weight|[32]|32\n", "encoder.conv.conv_stack.0.bn.bias|[32]|32\n", "encoder.conv.conv_stack.0.bn._mean|[32]|32\n", "encoder.conv.conv_stack.0.bn._variance|[32]|32\n", "encoder.rnn.rnn_stacks.0.fw_fc.weight|[1312, 3072]|4030464\n", "encoder.rnn.rnn_stacks.0.fw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.fw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.fw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.fw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_fc.weight|[1312, 3072]|4030464\n", "encoder.rnn.rnn_stacks.0.bw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.fw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.0.fw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.0.bw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.0.fw_rnn.cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.0.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.0.bw_rnn.cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_fc.weight|[2048, 3072]|6291456\n", "encoder.rnn.rnn_stacks.1.fw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_fc.weight|[2048, 3072]|6291456\n", "encoder.rnn.rnn_stacks.1.bw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.1.fw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.1.bw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.1.fw_rnn.cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.1.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.1.bw_rnn.cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_fc.weight|[2048, 3072]|6291456\n", 
"encoder.rnn.rnn_stacks.2.fw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_fc.weight|[2048, 3072]|6291456\n", "encoder.rnn.rnn_stacks.2.bw_bn.weight|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_bn.bias|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_bn._mean|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_bn._variance|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.2.fw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.2.bw_cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.fw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.2.fw_rnn.cell.bias_hh|[3072]|3072\n", "encoder.rnn.rnn_stacks.2.bw_rnn.cell.weight_hh|[3072, 1024]|3145728\n", "encoder.rnn.rnn_stacks.2.bw_rnn.cell.bias_hh|[3072]|3072\n", "decoder.ctc_lo.weight|[2048, 4300]|8806400\n", "decoder.ctc_lo.bias|[4300]|4300\n", "layer has 66 parameters, 80148012 elements.\n" ] } ], "source": [ "model = DeepSpeech2InferModel.from_pretrained(dataset, config,\n", " args.checkpoint_path)\n", "model.eval()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "examples/aishell/jit.model.pdmodel\n", "examples/aishell/jit.model.pdiparams\n", "0\n", "False\n" ] } ], "source": [ "\n", "from paddle.inference import Config\n", "from paddle.inference import PrecisionType\n", "from paddle.inference import create_predictor\n", "\n", "args.use_gpu=False\n", "paddle.set_device('cpu')\n", "\n", "def init_predictor(args):\n", " if args.model_dir is not None:\n", " config = Config(args.model_dir)\n", " else:\n", " config = Config(args.model_file, args.params_file)\n", "\n", " if args.use_gpu:\n", " config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n", "# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n", "# use_calib_mode=True) # 开启TensorRT预测,精度为fp32,开启int8离线量化\n", " else:\n", " # If not specific mkldnn, you can set the blas thread.\n", " # The thread num should not be greater than the number of cores in the CPU.\n", " config.set_cpu_math_library_num_threads(1)\n", " config.enable_mkldnn()\n", " \n", " config.enable_memory_optim()\n", " config.switch_ir_optim(True)\n", " \n", " print(config.model_dir())\n", " print(config.prog_file())\n", " print(config.params_file())\n", " print(config.gpu_device_id())\n", " print(args.use_gpu)\n", " predictor = create_predictor(config)\n", " return predictor\n", "\n", "def run(predictor, audio, audio_len):\n", " # copy img data to input tensor\n", " input_names = predictor.get_input_names()\n", " for i, name in enumerate(input_names):\n", " print(\"input:\", i, name)\n", " \n", " audio_tensor = predictor.get_input_handle('audio')\n", " audio_tensor.reshape(audio.shape)\n", " audio_tensor.copy_from_cpu(audio.copy())\n", " \n", " audiolen_tensor = predictor.get_input_handle('audio_len')\n", " audiolen_tensor.reshape(audio_len.shape)\n", " audiolen_tensor.copy_from_cpu(audio_len.copy())\n", "\n", " output_names = predictor.get_output_names()\n", " for i, name in enumerate(output_names):\n", " print(\"output:\", i, name)\n", "\n", " # do the inference\n", " predictor.run()\n", "\n", " results = []\n", " # get out data from output tensor\n", " output_names = 
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "examples/aishell/jit.model.pdmodel\n", "examples/aishell/jit.model.pdiparams\n", "0\n", "False\n" ] } ], "source": [ "\n", "from paddle.inference import Config\n", "from paddle.inference import PrecisionType\n", "from paddle.inference import create_predictor\n", "\n", "args.use_gpu = False\n", "paddle.set_device('cpu')\n", "\n", "def init_predictor(args):\n", " if args.model_dir is not None:\n", " config = Config(args.model_dir)\n", " else:\n", " config = Config(args.model_file, args.params_file)\n", "\n", " if args.use_gpu:\n", " config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)\n", "# config.enable_tensorrt_engine(precision_mode=PrecisionType.Float32,\n", "# use_calib_mode=True) # enable TensorRT inference at fp32 precision with int8 offline quantization\n", " else:\n", " # If mkldnn is not enabled, you can still set the BLAS thread count.\n", " # The thread count should not exceed the number of cores in the CPU.\n", " config.set_cpu_math_library_num_threads(1)\n", " config.enable_mkldnn()\n", " \n", " config.enable_memory_optim()\n", " config.switch_ir_optim(True)\n", " \n", " print(config.model_dir())\n", " print(config.prog_file())\n", " print(config.params_file())\n", " print(config.gpu_device_id())\n", " print(args.use_gpu)\n", " predictor = create_predictor(config)\n", " return predictor\n", "\n", "def run(predictor, audio, audio_len):\n", " # copy audio data to the input tensors\n", " input_names = predictor.get_input_names()\n", " for i, name in enumerate(input_names):\n", " print(\"input:\", i, name)\n", " \n", " audio_tensor = predictor.get_input_handle('audio')\n", " audio_tensor.reshape(audio.shape)\n", " audio_tensor.copy_from_cpu(audio.copy())\n", " \n", " audiolen_tensor = predictor.get_input_handle('audio_len')\n", " audiolen_tensor.reshape(audio_len.shape)\n", " audiolen_tensor.copy_from_cpu(audio_len.copy())\n", "\n", " output_names = predictor.get_output_names()\n", " for i, name in enumerate(output_names):\n", " print(\"output:\", i, name)\n", "\n", " # do the inference\n", " predictor.run()\n", "\n", " results = []\n", " # fetch data from the output tensors\n", " output_names = predictor.get_output_names()\n", " for i, name in enumerate(output_names):\n", " output_tensor = predictor.get_output_handle(name)\n", " output_data = output_tensor.copy_to_cpu()\n", " results.append(output_data)\n", "\n", " return results\n", "\n", "\n", "predictor = init_predictor(args)\n", "\n", "def file_to_transcript(filename):\n", " print(filename)\n", " feature = dataset.process_utterance(filename, \"\")\n", " audio = np.array([feature[0]]).astype('float32') # [1, D, T]\n", " audio_len = feature[0].shape[1]\n", " audio_len = np.array([audio_len]).astype('int64') # [1]\n", " \n", " \n", " i_probs = run(predictor, audio, audio_len)\n", " print('jit:', i_probs[0], type(i_probs[0]))\n", " \n", " audio = paddle.to_tensor(audio)\n", " audio_len = paddle.to_tensor(audio_len)\n", " print(audio.shape)\n", " print(audio_len.shape)\n", " \n", " #eouts, eouts_len = model.encoder(audio, audio_len)\n", " #probs = model.decoder.softmax(eouts)\n", " probs = model.forward(audio, audio_len)\n", " print('paddle:', probs.numpy())\n", " \n", " flag = np.allclose(i_probs[0], probs.numpy())\n", " print(flag)\n", " \n", " return probs\n", "\n", "# result_transcript = model.decode(\n", "# audio,\n", "# audio_len,\n", "# vocab_list=dataset.vocab_list,\n", "# decoding_method=config.decoding.decoding_method,\n", "# lang_model_path=config.decoding.lang_model_path,\n", "# beam_alpha=config.decoding.alpha,\n", "# beam_beta=config.decoding.beta,\n", "# beam_size=config.decoding.beam_size,\n", "# cutoff_prob=config.decoding.cutoff_prob,\n", "# cutoff_top_n=config.decoding.cutoff_top_n,\n", "# num_processes=config.decoding.num_proc_bsearch)\n", "# return result_transcript[0]" ] },
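{ "cell_type": "markdown", "metadata": {}, "source": [ "`file_to_transcript` above compares the two backends with `np.allclose` at its default tolerances (`rtol=1e-5`, `atol=1e-8`), which are strict for float32 inference: in the run below the jit and dynamic-graph outputs agree to roughly six significant digits, yet the check still prints `False`. A minimal sketch for quantifying the actual gap, assuming the `i_probs` / `probs` values produced above:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Hedged sketch (not run here): measure the jit vs. dynamic-graph gap\n", "# instead of relying on np.allclose defaults (rtol=1e-5, atol=1e-8).\n", "def report_diff(a, b, rtol=1e-4, atol=1e-6):\n", "    a, b = np.asarray(a), np.asarray(b)\n", "    print('max abs diff:', np.abs(a - b).max())\n", "    print('allclose:', np.allclose(a, b, rtol=rtol, atol=atol))\n", "\n", "# usage after running file_to_transcript:\n", "# report_diff(i_probs[0], probs.numpy())" ] },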
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Warm-up Test Case 0: /home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n", "/home/ssd5/zhanghui/DeepSpeech2.x/examples/aishell/../dataset/aishell/data_aishell/wav/test/S0764/BAC009S0764W0124.wav\n", "input: 0 audio\n", "input: 1 audio_len\n", "output: 0 tmp_75\n", "jit: [[[8.91786298e-12 4.45648032e-12 3.67572750e-09 ... 8.91767563e-12\n", " 8.91573707e-12 4.64317296e-08]\n", " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", " 1.55891342e-15 9.99992609e-01]\n", " [1.24638127e-17 7.61802427e-16 2.93265812e-14 ... 1.24633371e-17\n", " 1.24587264e-17 1.00000000e+00]\n", " ...\n", " [4.37488240e-15 2.43676260e-12 1.98770514e-12 ... 4.37479896e-15\n", " 4.37354747e-15 1.00000000e+00]\n", " [3.89334696e-13 1.66754856e-11 1.42900388e-11 ... 3.89329492e-13\n", " 3.89252270e-13 1.00000000e+00]\n", " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n", " 1.00334095e-10 9.99998808e-01]]] <class 'numpy.ndarray'>\n", "[1, 161, 522]\n", "[1]\n", "paddle: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n", " 8.91577090e-12 4.64319072e-08]\n", " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", " 1.55891342e-15 9.99992609e-01]\n", " [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n", " 1.24587735e-17 1.00000000e+00]\n", " ...\n", " [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n", " 4.37354747e-15 1.00000000e+00]\n", " [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n", " 3.89253761e-13 1.00000000e+00]\n", " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n", " 1.00334095e-10 9.99998808e-01]]]\n", "False\n" ] } ], "source": [ "manifest = read_manifest(args.warmup_manifest)\n", "\n", "for idx, sample in enumerate(manifest[:1]):\n", " print(\"Warm-up Test Case %d: %s\" % (idx, sample['audio_filepath']))\n", " start_time = time.time()\n", " transcript = file_to_transcript(sample['audio_filepath'])\n", " finish_time = time.time()\n", "# print(\"Response Time: %f, Transcript: %s\" %\n", "# (finish_time - start_time, transcript))\n", " break" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1, 161, 522) (1,)\n", "input: 0 audio\n", "input: 1 audio_len\n", "output: 0 tmp_75\n", "jit: [[[8.91789680e-12 4.45649724e-12 3.67574149e-09 ... 8.91770945e-12\n", " 8.91577090e-12 4.64319072e-08]\n", " [1.55950222e-15 2.62794089e-14 4.50423509e-12 ... 1.55944271e-15\n", " 1.55891342e-15 9.99992609e-01]\n", " [1.24638599e-17 7.61805339e-16 2.93267472e-14 ... 1.24633842e-17\n", " 1.24587735e-17 1.00000000e+00]\n", " ...\n", " [4.37488240e-15 2.43676737e-12 1.98770514e-12 ... 4.37479896e-15\n", " 4.37354747e-15 1.00000000e+00]\n", " [3.89336187e-13 1.66755481e-11 1.42900925e-11 ... 3.89330983e-13\n", " 3.89253761e-13 1.00000000e+00]\n", " [1.00349985e-10 2.56293708e-10 2.91177582e-10 ... 1.00347876e-10\n", " 1.00334095e-10 9.99998808e-01]]]\n" ] } ], "source": [ "def test(filename):\n", " feature = dataset.process_utterance(filename, \"\")\n", " audio = np.array([feature[0]]).astype('float32') # [1, D, T]\n", " audio_len = feature[0].shape[1]\n", " audio_len = np.array([audio_len]).astype('int64') # [1]\n", " \n", " print(audio.shape, audio_len.shape)\n", "\n", " i_probs = run(predictor, audio, audio_len)\n", " print('jit:', i_probs[0])\n", " return i_probs\n", " \n", "probs = test(sample['audio_filepath'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 2 }