commit
8e348d66b9
@ -0,0 +1,67 @@
|
||||
([简体中文](./README_cn.md)|English)
|
||||
|
||||
# Streaming Speech Synthesis Service
|
||||
|
||||
## Introduction
|
||||
This demo shows how to start a streaming speech synthesis (TTS) service and how to access it from a client.
|
||||
|
||||
The `Server` must be started inside the Docker container, while the `Client` can run either inside or outside of it.
|
||||
|
||||
**The `streaming_tts_serving` directory located in the same directory as this README (`$PWD`) contains the configuration and code of the model; it needs to be mapped into the Docker container for use.**
|
||||
|
||||
## Usage
|
||||
### 1. Server
|
||||
#### 1.1 Docker
|
||||
|
||||
```bash
|
||||
docker pull registry.baidubce.com/paddlepaddle/fastdeploy_serving_cpu_only:22.09
|
||||
docker run -dit --net=host --name fastdeploy --shm-size="1g" -v $PWD:/models registry.baidubce.com/paddlepaddle/fastdeploy_serving_cpu_only:22.09
|
||||
docker exec -it -u root fastdeploy bash
|
||||
```
|
||||
|
||||
#### 1.2 Installation(inside the docker)
|
||||
```bash
|
||||
apt-get install build-essential python3-dev libssl-dev libffi-dev libxml2 libxml2-dev libxslt1-dev zlib1g-dev libsndfile1 language-pack-zh-hans wget zip
|
||||
pip3 install paddlespeech
|
||||
export LC_ALL="zh_CN.UTF-8"
|
||||
export LANG="zh_CN.UTF-8"
|
||||
export LANGUAGE="zh_CN:zh:en_US:en"
|
||||
```
|
||||
|
||||
#### 1.3 Download models(inside the docker)
|
||||
```bash
|
||||
cd /models/streaming_tts_serving/1
|
||||
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
|
||||
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip
|
||||
unzip fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
|
||||
unzip mb_melgan_csmsc_onnx_0.2.0.zip
|
||||
```
|
||||
**For convenience, we recommend using the `-v` option of `docker run` to map `$PWD` (which contains `streaming_tts_serving` together with the configuration and code of the model) to the path `/models` inside the container. You can also use other methods, but whichever method you use, the final model directory structure inside the container must match the one shown in the following figure.**
|
||||
|
||||
<p align="center">
|
||||
<img src="./tree.png" />
|
||||
</p>
|
||||
|
||||
#### 1.4 Start the server(inside the docker)
|
||||
|
||||
```bash
|
||||
fastdeployserver --model-repository=/models --model-control-mode=explicit --load-model=streaming_tts_serving
|
||||
```
|
||||
Arguments:
|
||||
- `model-repository`(required): Path of model storage.
|
||||
- `model-control-mode`(required): The mode of loading the model. At present, you can use 'explicit'.
|
||||
- `load-model`(required): Name of the model to be loaded.
|
||||
- `http-port`(optional): Port for http service. Default: `8000`. This is not used in our example.
|
||||
- `grpc-port`(optional): Port for grpc service. Default: `8001`.
|
||||
- `metrics-port`(optional): Port for metrics service. Default: `8002`. This is not used in our example.
|
||||
|
||||
### 2. Client
|
||||
#### 2.1 Installation
|
||||
```bash
|
||||
pip3 install tritonclient[all]
|
||||
```
|
||||
|
||||
#### 2.2 Send request
|
||||
```bash
|
||||
python3 /models/streaming_tts_serving/stream_client.py
|
||||
```
|
@ -0,0 +1,33 @@
|
||||
name: "streaming_tts_serving"
|
||||
backend: "python"
|
||||
max_batch_size: 0
|
||||
model_transaction_policy {
|
||||
decoupled: True
|
||||
}
|
||||
input [
|
||||
{
|
||||
name: "INPUT_0"
|
||||
data_type: TYPE_STRING
|
||||
dims: [ 1 ]
|
||||
}
|
||||
]
|
||||
|
||||
output [
|
||||
{
|
||||
name: "OUTPUT_0"
|
||||
data_type: TYPE_FP32
|
||||
dims: [ -1, 1 ]
|
||||
},
|
||||
{
|
||||
name: "status"
|
||||
data_type: TYPE_BOOL
|
||||
dims: [ 1 ]
|
||||
}
|
||||
]
|
||||
|
||||
instance_group [
|
||||
{
|
||||
count: 1
|
||||
kind: KIND_CPU
|
||||
}
|
||||
]
|
@ -0,0 +1,117 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import queue
|
||||
import sys
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import tritonclient.grpc as grpcclient
|
||||
from tritonclient.utils import *
|
||||
|
||||
FLAGS = None
|
||||
|
||||
|
||||
class UserData:
    """Container for the queue shared between the stream callback and the reader."""

    def __init__(self):
        # FIFO of finished requests; each entry is either an inference
        # result or the error object reported for that request.
        self._completed_requests = queue.Queue()
|
||||
|
||||
|
||||
def callback(user_data, result, error):
    """Stream callback: enqueue the outcome of one inference response.

    The last two parameters must be ``result`` and ``error``.
    InferenceServerClient provides the result of an inference as a
    grpcclient.InferResult in ``result``. For a successful inference
    ``error`` is None; otherwise it is a
    tritonclientutils.InferenceServerException holding the error details.

    Args:
        user_data: object exposing a ``_completed_requests`` queue.
        result: the inference result of this response.
        error: the error for this response, or None on success.
    """
    # Exactly one item is queued per call: the error when there is one,
    # the result otherwise.
    outcome = error if error else result
    user_data._completed_requests.put(outcome)
|
||||
|
||||
|
||||
def async_stream_send(triton_client, values, request_id, model_name):
    """Send the strings in ``values`` as one asynchronous streaming request.

    Each string is UTF-8 encoded and wrapped in a one-element "BYTES"
    input tensor named 'INPUT_0'; the 'OUTPUT_0' tensor is requested back.

    Args:
        triton_client: client on which ``async_stream_infer`` is called.
        values: iterable of ``str`` to synthesize.
        request_id: id attached to the request.
        model_name: name of the model the request is addressed to.
    """
    infer_inputs = []
    outputs = []
    for text in values:
        payload = np.array([text.encode('utf-8')], dtype=np.object_)
        tensor = grpcclient.InferInput('INPUT_0', [len(payload)], "BYTES")
        tensor.set_data_from_numpy(payload)
        infer_inputs.append(tensor)

        # NOTE(review): one requested output is appended per input value,
        # assuming the original append sat inside the loop - confirm.
        outputs.append(grpcclient.InferRequestedOutput('OUTPUT_0'))

    # Issue the asynchronous sequence inference.
    triton_client.async_stream_infer(
        model_name=model_name,
        inputs=infer_inputs,
        outputs=outputs,
        request_id=request_id)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: open a gRPC stream to the server, send one
    # text request and print every audio chunk until the server marks the
    # response stream as finished.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action="store_true",
        required=False,
        default=False,
        help='Enable verbose output')
    parser.add_argument(
        '-u',
        '--url',
        type=str,
        required=False,
        default='localhost:8001',
        help='Inference server URL and its gRPC port. Default is localhost:8001.')

    FLAGS = parser.parse_args()

    # Name of the model the request is addressed to.
    model_name = "streaming_tts_serving"

    # Texts to synthesize.
    values = ["哈哈哈哈"]

    request_id = "0"

    user_data = UserData()

    # It is advisable to use the client object within a with..as clause
    # when sending streaming requests. This ensures the client
    # is closed when the block inside with exits.
    with grpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose) as triton_client:
        try:
            # Establish stream
            triton_client.start_stream(callback=partial(callback, user_data))
            # Now send the inference request...
            async_stream_send(triton_client, values, request_id, model_name)
        except InferenceServerException as error:
            print(error)
            sys.exit(1)

        # Retrieve results...
        recv_count = 0
        # Maps response id -> list of (receive index, result) pairs.
        result_dict = {}
        while True:
            # Blocks until the callback has queued the next item.
            data_item = user_data._completed_requests.get()
            # isinstance (not type equality) so that subclasses of
            # InferenceServerException are also treated as errors.
            if isinstance(data_item, InferenceServerException):
                raise data_item

            this_id = data_item.get_response().id
            result_dict.setdefault(this_id, []).append((recv_count, data_item))
            sub_wav = data_item.as_numpy('OUTPUT_0')
            status = data_item.as_numpy('status')
            print('sub_wav = ', sub_wav, "subwav.shape = ", sub_wav.shape)
            print('status = ', status)
            # The response whose 'status' flag is set is the last one.
            if status[0] == 1:
                break
            recv_count += 1

    print("PASS: stream_client")
|
After Width: | Height: | Size: 24 KiB |
@ -0,0 +1,44 @@
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
dataset_type: Ernie
|
||||
train_path: data/iwslt2012_zh/train.txt
|
||||
dev_path: data/iwslt2012_zh/dev.txt
|
||||
test_path: data/iwslt2012_zh/test.txt
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
data_params:
|
||||
pretrained_token: ernie-3.0-base-zh
|
||||
punc_path: data/iwslt2012_zh/punc_vocab
|
||||
seq_len: 100
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model_type: ErnieLinear
|
||||
model:
|
||||
pretrained_token: ernie-3.0-base-zh
|
||||
num_classes: 4
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer_params:
|
||||
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||
|
||||
scheduler_params:
|
||||
learning_rate: 1.0e-5 # learning rate.
|
||||
gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and values closer to 1.0 work better.
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 20
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
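num_snapshots: 10 # max number of snapshots to keep while training. NOTE(review): `num_snapshots` is also set (to 5) under TRAINING SETTING; duplicate keys are ambiguous in YAML, keep only one.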
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,44 @@
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
dataset_type: Ernie
|
||||
train_path: data/iwslt2012_zh/train.txt
|
||||
dev_path: data/iwslt2012_zh/dev.txt
|
||||
test_path: data/iwslt2012_zh/test.txt
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
data_params:
|
||||
pretrained_token: ernie-3.0-medium-zh
|
||||
punc_path: data/iwslt2012_zh/punc_vocab
|
||||
seq_len: 100
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model_type: ErnieLinear
|
||||
model:
|
||||
pretrained_token: ernie-3.0-medium-zh
|
||||
num_classes: 4
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer_params:
|
||||
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||
|
||||
scheduler_params:
|
||||
learning_rate: 1.0e-5 # learning rate.
|
||||
gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and values closer to 1.0 work better.
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 20
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
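num_snapshots: 10 # max number of snapshots to keep while training. NOTE(review): `num_snapshots` is also set (to 5) under TRAINING SETTING; duplicate keys are ambiguous in YAML, keep only one.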
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,44 @@
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
dataset_type: Ernie
|
||||
train_path: data/iwslt2012_zh/train.txt
|
||||
dev_path: data/iwslt2012_zh/dev.txt
|
||||
test_path: data/iwslt2012_zh/test.txt
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
data_params:
|
||||
pretrained_token: ernie-3.0-mini-zh
|
||||
punc_path: data/iwslt2012_zh/punc_vocab
|
||||
seq_len: 100
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model_type: ErnieLinear
|
||||
model:
|
||||
pretrained_token: ernie-3.0-mini-zh
|
||||
num_classes: 4
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer_params:
|
||||
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||
|
||||
scheduler_params:
|
||||
learning_rate: 1.0e-5 # learning rate.
|
||||
gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and values closer to 1.0 work better.
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 20
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
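num_snapshots: 10 # max number of snapshots to keep while training. NOTE(review): `num_snapshots` is also set (to 5) under TRAINING SETTING; duplicate keys are ambiguous in YAML, keep only one.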
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,44 @@
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
dataset_type: Ernie
|
||||
train_path: data/iwslt2012_zh/train.txt
|
||||
dev_path: data/iwslt2012_zh/dev.txt
|
||||
test_path: data/iwslt2012_zh/test.txt
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
data_params:
|
||||
pretrained_token: ernie-3.0-nano-zh
|
||||
punc_path: data/iwslt2012_zh/punc_vocab
|
||||
seq_len: 100
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model_type: ErnieLinear
|
||||
model:
|
||||
pretrained_token: ernie-3.0-nano-zh
|
||||
num_classes: 4
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer_params:
|
||||
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||
|
||||
scheduler_params:
|
||||
learning_rate: 1.0e-5 # learning rate.
|
||||
gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and values closer to 1.0 work better.
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 20
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
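num_snapshots: 10 # max number of snapshots to keep while training. NOTE(review): `num_snapshots` is also set (to 5) under TRAINING SETTING; duplicate keys are ambiguous in YAML, keep only one.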
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,44 @@
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
dataset_type: Ernie
|
||||
train_path: data/iwslt2012_zh/train.txt
|
||||
dev_path: data/iwslt2012_zh/dev.txt
|
||||
test_path: data/iwslt2012_zh/test.txt
|
||||
batch_size: 64
|
||||
num_workers: 2
|
||||
data_params:
|
||||
pretrained_token: ernie-tiny
|
||||
punc_path: data/iwslt2012_zh/punc_vocab
|
||||
seq_len: 100
|
||||
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model_type: ErnieLinear
|
||||
model:
|
||||
pretrained_token: ernie-tiny
|
||||
num_classes: 4
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer_params:
|
||||
weight_decay: 1.0e-6 # weight decay coefficient.
|
||||
|
||||
scheduler_params:
|
||||
learning_rate: 1.0e-5 # learning rate.
|
||||
gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and values closer to 1.0 work better.
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 20
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
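num_snapshots: 10 # max number of snapshots to keep while training. NOTE(review): `num_snapshots` is also set (to 5) under TRAINING SETTING; duplicate keys are ambiguous in YAML, keep only one.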
|
||||
seed: 42 # random seed for paddle, random, and np.random
|
@ -0,0 +1,8 @@
|
||||
# LibriSpeech
|
||||
|
||||
## Wav2VecASR
|
||||
train: Epoch 1, 1*V100-32G, batchsize:10
|
||||
|
||||
| Model | Params | Config | Augmentation| Test set | Decode method | WER |
|
||||
| --- | --- | --- | --- | --- | --- | --- |
|
||||
| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
|
@ -0,0 +1,4 @@
|
||||
process:
|
||||
# use raw audio
|
||||
- type: wav_process
|
||||
dither: 0.0
|
@ -0,0 +1,4 @@
|
||||
decode_batch_size: 1
|
||||
error_rate_type: wer
|
||||
decoding_method: ctc_greedy_search # 'ctc_greedy_search', 'ctc_prefix_beam_search'
|
||||
beam_size: 10
|
@ -0,0 +1,120 @@
|
||||
############################################
|
||||
# Network Architecture #
|
||||
############################################
|
||||
freeze_wav2vec2: True
|
||||
normalize_wav: True
|
||||
output_norm: True
|
||||
dnn_blocks: 2
|
||||
dnn_neurons: 1024
|
||||
blank_id: 0
|
||||
ctc_dropout_rate: 0.0
|
||||
wav2vec2_params_path: "exp/wav2vec2/wav2vec2-large-960h-lv60-self.pdparams"
|
||||
|
||||
############################################
|
||||
# Wav2Vec2.0 #
|
||||
############################################
|
||||
vocab_size: 32
|
||||
hidden_size: 1024
|
||||
num_hidden_layers: 24
|
||||
num_attention_heads: 16
|
||||
intermediate_size: 4096
|
||||
hidden_act: "gelu"
|
||||
hidden_dropout: 0.1
|
||||
activation_dropout: 0.1
|
||||
attention_dropout: 0.1
|
||||
feat_proj_dropout: 0.1
|
||||
feat_quantizer_dropout: 0.0
|
||||
final_dropout: 0.1
|
||||
layerdrop: 0.1
|
||||
initializer_range: 0.02
|
||||
layer_norm_eps: 1e-5
|
||||
feat_extract_norm: "layer"
|
||||
feat_extract_activation: "gelu"
|
||||
conv_dim: [512, 512, 512, 512, 512, 512, 512]
|
||||
conv_stride: [5, 2, 2, 2, 2, 2, 2]
|
||||
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
|
||||
conv_bias: True
|
||||
num_conv_pos_embeddings: 128
|
||||
num_conv_pos_embedding_groups: 16
|
||||
do_stable_layer_norm: True
|
||||
apply_spec_augment: False
|
||||
mask_time_prob: 0.05
|
||||
mask_time_length: 10
|
||||
mask_time_min_masks: 2
|
||||
mask_feature_prob: 0.0
|
||||
mask_feature_length: 10
|
||||
mask_feature_min_masks: 0
|
||||
num_codevectors_per_group: 320
|
||||
num_codevector_groups: 2
|
||||
contrastive_logits_temperature: 0.1
|
||||
num_negatives: 100
|
||||
codevector_dim: 256
|
||||
proj_codevector_dim: 256
|
||||
diversity_loss_weight: 0.1
|
||||
ctc_loss_reduction: "sum"
|
||||
ctc_zero_infinity: False
|
||||
use_weighted_layer_sum: False
|
||||
pad_token_id: 0
|
||||
bos_token_id: 1
|
||||
eos_token_id: 2
|
||||
add_adapter: False
|
||||
adapter_kernel_size: 3
|
||||
adapter_stride: 2
|
||||
num_adapter_layers: 3
|
||||
output_hidden_size: None
|
||||
|
||||
###########################################
|
||||
# Data #
|
||||
###########################################
|
||||
train_manifest: data/manifest.train
|
||||
dev_manifest: data/manifest.dev
|
||||
test_manifest: data/manifest.test-clean
|
||||
|
||||
|
||||
###########################################
|
||||
# Dataloader #
|
||||
###########################################
|
||||
vocab_filepath: data/lang_char/vocab.txt
|
||||
unit_type: 'char'
|
||||
mean_std_filepath: ""
|
||||
preprocess_config: conf/preprocess.yaml
|
||||
sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for 'other' epochs
|
||||
batch_size: 10 # Different batch_size may cause large differences in results
|
||||
maxlen_in: 51200000000 # if input length > maxlen-in batchsize is automatically reduced
|
||||
maxlen_out: 1500000 # if output length > maxlen-out batchsize is automatically reduced
|
||||
minibatches: 0 # for debug
|
||||
batch_count: auto
|
||||
batch_bins: 0
|
||||
batch_frames_in: 0
|
||||
batch_frames_out: 0
|
||||
batch_frames_inout: 0
|
||||
num_workers: 0
|
||||
subsampling_factor: 1
|
||||
num_encs: 1
|
||||
dist_sampler: True
|
||||
shortest_first: True
|
||||
return_lens_rate: True
|
||||
|
||||
|
||||
###########################################
|
||||
# Training #
|
||||
###########################################
|
||||
n_epoch: 1
|
||||
accum_grad: 1
|
||||
global_grad_clip: 3.0
|
||||
model_optim: adadelta
|
||||
model_optim_conf:
|
||||
lr: 0.9
|
||||
epsilon: 1.0e-6
|
||||
rho: 0.95
|
||||
scheduler: constantlr
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
||||
lr_decay: 1.0
|
||||
log_interval: 1
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
||||
augment: True
|
||||
|
||||
|
@ -0,0 +1,110 @@
|
||||
#!/bin/bash

# Prepare LibriSpeech for wav2vec2 ASR training.
# Stages (selected with --stage / --stop_stage via parse_options.sh):
#   -1: download the data and build the raw manifests
#    0: compute mean/stddev for the normalizer
#    1: build the vocabulary
#    2: format the manifests (token ids, vocab size)
#    3: download the pretrained wav2vec2 parameters
# Expects MAIN_ROOT to be set in the environment.

stage=-1
stop_stage=100

unit_type=char
dict_dir=data/lang_char

source ${MAIN_ROOT}/utils/parse_options.sh

mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/librispeech/librispeech.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/librispeech" \
    --full_download="True"

    if [ $? -ne 0 ]; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi

    for subset in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        mv data/manifest.${subset} data/manifest.${subset}.raw
    done

    # Start from empty merged manifests because the loops below append.
    rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
    for subset in train-clean-100 train-clean-360 train-other-500; do
        cat data/manifest.${subset}.raw >> data/manifest.train.raw
    done

    for subset in dev-clean dev-other; do
        cat data/manifest.${subset}.raw >> data/manifest.dev.raw
    done

    for subset in test-clean test-other; do
        cat data/manifest.${subset}.raw >> data/manifest.test.raw
    done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --num_samples=2000 \
    --spectrum_type="fbank" \
    --feat_dim=161 \
    --delta_delta=false \
    --sample_rate=16000 \
    --stride_ms=10 \
    --window_ms=25 \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
    --unit_type ${unit_type} \
    --count_threshold=0 \
    --vocab_path="${dict_dir}/vocab.txt" \
    --manifest_paths="data/manifest.train.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    # Each subset is formatted in a background job. `exit 1` inside a job
    # only ends that job, so the PIDs are collected and waited on one by
    # one to propagate any failure to this script.
    pids=()
    for subset in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --cmvn_path "data/mean_std.json" \
        --unit_type ${unit_type} \
        --vocab_path="${dict_dir}/vocab.txt" \
        --manifest_path="data/manifest.${subset}.raw" \
        --output_path="data/manifest.${subset}"

        if [ $? -ne 0 ]; then
            echo "Format manifest.${subset} failed. Terminated."
            exit 1
        fi
    }&
    pids+=($!)
    done

    failed=0
    for pid in "${pids[@]}"; do
        wait "${pid}" || failed=1
    done
    if [ ${failed} -ne 0 ]; then
        exit 1
    fi
fi

echo "LibriSpeech Data preparation done."

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    mkdir -p exp/wav2vec2
    echo "Pretrained wav2vec2 model download"
    wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams
    if [ $? -ne 0 ]; then
        echo "Pretrained wav2vec2 model download failed. Terminated."
        exit 1
    fi
fi

exit 0
|
@ -0,0 +1,84 @@
|
||||
#!/bin/bash

# Decode test-clean with a trained checkpoint and score the hypotheses.
# usage: test.sh config_path decode_config_path ckpt_prefix
# Expects MAIN_ROOT and BIN_DIR to be set in the environment.

set -e

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

expdir=exp
datadir=data

train_set=train_960
# Only test-clean is configured here; the unused full list was
# "test-clean test-other dev-clean dev-other".
recog_set="test-clean"

config_path=$1
decode_config_path=$2
ckpt_prefix=$3

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
#   exit 1
#fi

python3 utils/format_rsl.py \
    --origin_ref data/manifest.test-clean.raw \
    --trans_ref data/manifest.test-clean.text

# Decode with one decoding method, convert the result file to text and
# write the error report to ${ckpt_prefix}.<method>.error.
#   $1: decoding method passed as decode.decoding_method
#   $2: value passed as decode.decode_batch_size
decode_and_score() {
    local method=$1
    local batch_size=$2

    echo "decoding ${method}"
    # `set -e` would abort the script before a later `$?` test could run,
    # so the command is tested directly to make the message reachable.
    if ! python3 -u ${BIN_DIR}/test.py \
        --ngpu ${ngpu} \
        --config ${config_path} \
        --decode_cfg ${decode_config_path} \
        --result_file ${ckpt_prefix}.${method}.rsl \
        --checkpoint_path ${ckpt_prefix} \
        --opts decode.decoding_method ${method} \
        --opts decode.decode_batch_size ${batch_size}; then
        echo "Failed in evaluation!"
        exit 1
    fi

    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${method}.rsl \
        --trans_hyp ${ckpt_prefix}.${method}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${method}.rsl.text > ${ckpt_prefix}.${method}.error
    echo "decoding ${method} done."
}

decode_and_score ctc_greedy_search 16
decode_and_score ctc_prefix_beam_search 1

echo "Finished"

exit 0
|
@ -0,0 +1,58 @@
|
||||
#!/bin/bash

# Decode a single audio file with a trained checkpoint.
# Expects BIN_DIR to be set in the environment.

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

# Fetch the demo wav; -nc skips the download when the file already exists.
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/ || exit 1

# Quoted so that an empty argument or a path containing spaces is
# rejected or handled correctly instead of slipping through the test.
if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#   exit 1
#fi

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=1
    # Results are written into a directory named after the checkpoint prefix.
    output_dir="${ckpt_prefix}"
    mkdir -p "${output_dir}"
    if ! python3 -u "${BIN_DIR}/test_wav.py" \
        --ngpu ${ngpu} \
        --config "${config_path}" \
        --decode_cfg "${decode_config_path}" \
        --result_file "${output_dir}/${type}.rsl" \
        --checkpoint_path "${ckpt_prefix}" \
        --opts decode.decoding_method ${type} \
        --opts decode.decode_batch_size ${batch_size} \
        --audio_file "${audio_file}"; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
|
@ -0,0 +1,55 @@
|
||||
#!/bin/bash

# Train a model on CPU (no visible GPU) or through paddle.distributed.launch.
# Expects BIN_DIR to be set in the environment.

# Two or three arguments are accepted. `||` is required here: a count
# cannot be below 2 and above 3 at once, so `&&` never triggered.
if [ $# -lt 2 ] || [ $# -gt 3 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
ips=$3

# Only pass --ips to the launcher when an ip list was given.
if [ -z "${ips}" ];then
    ips_config=
else
    ips_config="--ips="${ips}
fi

mkdir -p exp

# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then
    python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --seed ${seed}
else
    python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --seed ${seed}
fi
# Capture the training status now; the commands below would overwrite $?.
train_status=$?

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ ${train_status} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
|
@ -0,0 +1,15 @@
|
||||
# Environment for this example; meant to be sourced, not executed.

# Repository root, three levels above the current directory.
export MAIN_ROOT=$(realpath ${PWD}/../../../)

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/

export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONDONTWRITEBYTECODE=1

# Directory holding the entry-point scripts of the selected model.
MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
|
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# Pipeline driver: data preparation, training, checkpoint averaging,
# test-set decoding and single-wav decoding, selected by stage range.
set -e

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=0
stage=0
stop_stage=0
conf_path=conf/wav2vec2ASR.yaml
ips=            #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
dict_path=data/lang_char/vocab.txt

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

audio_file=data/demo_002_en.wav

avg_ckpt=avg_${avg_num}
# Checkpoint name is the config file name up to its first dot.
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

# Succeeds when stage number $1 lies within [stage, stop_stage].
stage_enabled() {
    [ ${stage} -le $1 ] && [ ${stop_stage} -ge $1 ]
}

if stage_enabled 0; then
    # prepare data
    bash ./local/data.sh || exit -1
fi

if stage_enabled 1; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi

if stage_enabled 2; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if stage_enabled 3; then
    # greedy search decoder
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

if stage_enabled 4; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
|
@ -0,0 +1 @@
|
||||
../../../utils
|
@ -0,0 +1,110 @@
|
||||
#!/bin/bash
# Fine-tuning pipeline for a pretrained fastspeech2 acoustic model. Runs the
# stages whose number lies in [stage, stop_stage]:
#   0: OOV check            1: MFA alignment      2: durations.txt
#   3: feature extraction   4: fine-tune env      5: fine-tuning
#   6: end-to-end synthesis with the hifigan vocoder

set -e
source path.sh

# NOTE(review): new_dir is derived here, before parse_options.sh runs, so a
# command-line override of input_dir / newdir_name would not be reflected in
# new_dir -- confirm whether that is intended.
input_dir=./input/SSB0005_mini
newdir_name="newdir"
new_dir=${input_dir}/${newdir_name}
pretrained_model_dir=./pretrained_models/fastspeech2_mix_ckpt_1.2.0
mfa_tools=./tools
mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
ngpu=1
finetune_config=./conf/finetune.yaml
replace_spkid=174  # csmsc: 174, ljspeech: 175, aishell3: 0~173, vctk: 176

ckpt=snapshot_iter_99300

gpus=1
# NOTE(review): this assignment is not exported and happens before option
# parsing; unless CUDA_VISIBLE_DEVICES is already exported in the calling
# environment, the python3 child processes below will not see it -- confirm.
CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100

# The stage range can be chosen on the command line,
# e.g. `./run.sh --stage 0 --stop-stage 0`.
# Option parsing cannot be combined with positional arguments (`$1`, `$2`, ...).
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

# Succeeds when stage number $1 lies inside [stage, stop_stage].
run_stage() {
    [ ${stage} -le $1 ] && [ ${stop_stage} -ge $1 ]
}

# check oov
if run_stage 0; then
    echo "check oov"
    python3 local/check_oov.py \
        --input_dir=${input_dir} \
        --pretrained_model_dir=${pretrained_model_dir} \
        --newdir_name=${newdir_name} \
        --lang=${lang}
fi

# get mfa result
if run_stage 1; then
    echo "get mfa result"
    python3 local/get_mfa_result.py \
        --input_dir=${new_dir} \
        --mfa_dir=${mfa_dir} \
        --lang=${lang}
fi

# generate durations.txt
if run_stage 2; then
    echo "generate durations.txt"
    python3 local/generate_duration.py \
        --mfa_dir=${mfa_dir}
fi

# extract feature
if run_stage 3; then
    echo "extract feature"
    python3 local/extract_feature.py \
        --duration_file="./durations.txt" \
        --input_dir=${new_dir} \
        --dump_dir=${dump_dir} \
        --pretrained_model_dir=${pretrained_model_dir} \
        --replace_spkid=$replace_spkid
fi

# create finetune env
if run_stage 4; then
    echo "create finetune env"
    python3 local/prepare_env.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --output_dir=${output_dir}
fi

# finetune
if run_stage 5; then
    echo "finetune..."
    python3 local/finetune.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --dump_dir=${dump_dir} \
        --output_dir=${output_dir} \
        --ngpu=${ngpu} \
        --epoch=100 \
        --finetune_config=${finetune_config}
fi

# synthesize e2e
if run_stage 6; then
    echo "in hifigan syn_e2e"
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_aishell3 \
        --am_config=${pretrained_model_dir}/default.yaml \
        --am_ckpt=${output_dir}/checkpoints/${ckpt}.pdz \
        --am_stat=${pretrained_model_dir}/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=pretrained_models/hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=mix \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=$replace_spkid
fi
|
||||
|
@ -0,0 +1,101 @@
|
||||
############################################
|
||||
# Network Architecture #
|
||||
############################################
|
||||
cmvn_file:
|
||||
cmvn_file_type: "json"
|
||||
# encoder related
|
||||
encoder: conformer
|
||||
encoder_conf:
|
||||
output_size: 512 # dimension of attention
|
||||
attention_heads: 8
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.0
|
||||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
|
||||
normalize_before: True
|
||||
use_cnn_module: True
|
||||
cnn_module_kernel: 15
|
||||
activation_type: swish
|
||||
pos_enc_layer_type: rel_pos
|
||||
selfattention_layer_type: rel_selfattn
|
||||
causal: true
|
||||
use_dynamic_chunk: true
|
||||
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
|
||||
use_dynamic_left_chunk: false
|
||||
# decoder related
|
||||
decoder: transformer
|
||||
decoder_conf:
|
||||
attention_heads: 8
|
||||
linear_units: 2048
|
||||
num_blocks: 6
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.0
|
||||
src_attention_dropout_rate: 0.0
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
reverse_weight: 0.0 # unidecoder
|
||||
length_normalized_loss: false
|
||||
init_type: 'kaiming_uniform'
|
||||
|
||||
# https://yaml.org/type/float.html
|
||||
###########################################
|
||||
# Data #
|
||||
###########################################
|
||||
train_manifest: data/train_l/data.list
|
||||
dev_manifest: data/dev/data.list
|
||||
test_manifest: data/test_meeting/data.list
|
||||
|
||||
###########################################
|
||||
# Dataloader #
|
||||
###########################################
|
||||
use_streaming_data: True
|
||||
unit_type: 'char'
|
||||
vocab_filepath: data/lang_char/vocab.txt
|
||||
preprocess_config: conf/preprocess.yaml
|
||||
spm_model_prefix: ''
|
||||
feat_dim: 80
|
||||
stride_ms: 10.0
|
||||
window_ms: 25.0
|
||||
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
|
||||
batch_size: 32
|
||||
do_filter: True
|
||||
maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced
|
||||
maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced
|
||||
minlen_in: 10
|
||||
minlen_out: 0
|
||||
minibatches: 0 # for debug
|
||||
batch_count: auto
|
||||
batch_bins: 0
|
||||
batch_frames_in: 0
|
||||
batch_frames_out: 0
|
||||
batch_frames_inout: 0
|
||||
num_workers: 0
|
||||
subsampling_factor: 1
|
||||
num_encs: 1
|
||||
|
||||
|
||||
###########################################
|
||||
# Training #
|
||||
###########################################
|
||||
n_epoch: 26
|
||||
accum_grad: 32
|
||||
global_grad_clip: 5.0
|
||||
dist_sampler: True
|
||||
log_interval: 1
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.001
|
||||
weight_decay: 1.0e-6
|
||||
scheduler: warmuplr
|
||||
scheduler_conf:
|
||||
warmup_steps: 5000
|
||||
lr_decay: 1.0
|
@ -0,0 +1,100 @@
|
||||
############################################
|
||||
# Network Architecture #
|
||||
############################################
|
||||
cmvn_file:
|
||||
cmvn_file_type: "json"
|
||||
# encoder related
|
||||
encoder: conformer
|
||||
encoder_conf:
|
||||
output_size: 512 # dimension of attention
|
||||
attention_heads: 8
|
||||
linear_units: 2048 # the number of units of position-wise feed forward
|
||||
num_blocks: 12 # the number of encoder blocks
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
attention_dropout_rate: 0.1
|
||||
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
|
||||
normalize_before: True
|
||||
use_cnn_module: True
|
||||
cnn_module_kernel: 15
|
||||
activation_type: swish
|
||||
pos_enc_layer_type: rel_pos
|
||||
selfattention_layer_type: rel_selfattn
|
||||
causal: true
|
||||
use_dynamic_chunk: true
|
||||
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
|
||||
use_dynamic_left_chunk: false
|
||||
# decoder related
|
||||
decoder: bitransformer
|
||||
decoder_conf:
|
||||
attention_heads: 8
|
||||
linear_units: 2048
|
||||
num_blocks: 3 # the number of decoder blocks
|
||||
r_num_blocks: 3 #only for bitransformer
|
||||
dropout_rate: 0.1
|
||||
positional_dropout_rate: 0.1
|
||||
self_attention_dropout_rate: 0.1
|
||||
src_attention_dropout_rate: 0.1
|
||||
|
||||
# hybrid CTC/attention
|
||||
model_conf:
|
||||
ctc_weight: 0.3
|
||||
lsm_weight: 0.1 # label smoothing option
|
||||
length_normalized_loss: false
|
||||
reverse_weight: 0.3 # only for bitransformer decoder
|
||||
init_type: 'kaiming_uniform' # !Warning: needed for convergence
|
||||
|
||||
###########################################
|
||||
# Data #
|
||||
###########################################
|
||||
train_manifest: data/train_l/data.list
|
||||
dev_manifest: data/dev/data.list
|
||||
test_manifest: data/test_meeting/data.list
|
||||
|
||||
###########################################
|
||||
# Dataloader #
|
||||
###########################################
|
||||
use_stream_data: True
|
||||
vocab_filepath: data/lang_char/vocab.txt
|
||||
unit_type: 'char'
|
||||
preprocess_config: conf/preprocess.yaml
|
||||
spm_model_prefix: ''
|
||||
feat_dim: 80
|
||||
stride_ms: 10.0
|
||||
window_ms: 25.0
|
||||
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
|
||||
batch_size: 32
|
||||
do_filter: True
|
||||
maxlen_in: 1200 # if do_filter == False && input length > maxlen-in, batchsize is automatically reduced
|
||||
maxlen_out: 100 # if do_filter == False && output length > maxlen-out, batchsize is automatically reduced
|
||||
minlen_in: 10
|
||||
minlen_out: 0
|
||||
minibatches: 0 # for debug
|
||||
batch_count: auto
|
||||
batch_bins: 0
|
||||
batch_frames_in: 0
|
||||
batch_frames_out: 0
|
||||
batch_frames_inout: 0
|
||||
num_workers: 0
|
||||
subsampling_factor: 1
|
||||
num_encs: 1
|
||||
|
||||
###########################################
|
||||
# Training #
|
||||
###########################################
|
||||
n_epoch: 150
|
||||
accum_grad: 8
|
||||
global_grad_clip: 5.0
|
||||
dist_sampler: False
|
||||
optim: adam
|
||||
optim_conf:
|
||||
lr: 0.002
|
||||
weight_decay: 1.0e-6
|
||||
scheduler: warmuplr
|
||||
scheduler_conf:
|
||||
warmup_steps: 25000
|
||||
lr_decay: 1.0
|
||||
log_interval: 100
|
||||
checkpoint:
|
||||
kbest_n: 50
|
||||
latest_n: 5
|
@ -0,0 +1,12 @@
|
||||
beam_size: 10
|
||||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
|
||||
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
|
||||
reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
|
||||
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
|
||||
# <0: for decoding, use full chunk.
|
||||
# >0: for decoding, use fixed chunk size as set.
|
||||
# 0: used for training, it's prohibited here.
|
||||
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
|
||||
simulate_streaming: True # simulate streaming inference. Defaults to False.
|
||||
decode_batch_size: 128
|
||||
error_rate_type: cer
|
@ -1,11 +1,12 @@
|
||||
decode_batch_size: 128
|
||||
error_rate_type: cer
|
||||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
|
||||
beam_size: 10
|
||||
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
|
||||
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
|
||||
reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
|
||||
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
|
||||
# <0: for decoding, use full chunk.
|
||||
# >0: for decoding, use fixed chunk size as set.
|
||||
# 0: used for training, it's prohibited here.
|
||||
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
|
||||
simulate_streaming: False # simulate streaming inference. Defaults to False.
|
||||
decode_batch_size: 128
|
||||
error_rate_type: cer
|
||||
|
@ -0,0 +1,59 @@
|
||||
#!/bin/bash
# Decode a single audio file with ${BIN_DIR}/quant.py.
# usage: test_wav.sh config_path decode_config_path ckpt_path_prefix audio_file

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

# Number of GPUs = number of comma-separated entries in CUDA_VISIBLE_DEVICES.
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

# Fetch the demo wav into data/ (-nc: skip when it is already there).
mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
   exit 1
fi

# Quoted on purpose: with an empty argument the unquoted form collapses to
# `[ ! -f ]`, which is false, so the check would silently pass.
if [ ! -f "${audio_file}" ]; then
    echo "Please input the right audio_file path"
    exit 1
fi


# Set to true when the config file name contains "chunk_"; not read anywhere
# else in this script.
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#   exit 1
#fi

for type in attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/quant.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
|
@ -0,0 +1,224 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Evaluation for U2 model."""
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import paddle
|
||||
import soundfile
|
||||
from paddleslim import PTQ
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.audio.transform.transformation import Transformation
|
||||
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
|
||||
from paddlespeech.s2t.models.u2 import U2Model
|
||||
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||
from paddlespeech.s2t.utils.log import Log
|
||||
from paddlespeech.s2t.utils.utility import UpdateConfig
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class U2Infer():
    """Decode one audio file with a PTQ-wrapped U2 model, then export it.

    The constructor builds the feature pipeline and the model, wraps the
    model with paddleslim's ``PTQ`` and loads the checkpoint weights.
    ``run()`` decodes ``args.audio_file`` once, converts the streaming entry
    points to static graphs and saves the quantized model.
    """

    def __init__(self, config, args):
        """Build preprocessing, text featurizer and the quantized model.

        Args:
            config: configuration node; this method reads
                ``preprocess_config``, ``unit_type``, ``vocab_filepath``,
                ``spm_model_prefix`` and ``feat_dim`` from it.
            args: parsed command line arguments; ``audio_file``, ``ngpu``
                and ``checkpoint_path`` are read here, ``export_path`` in
                ``run()``.
        """
        self.args = args
        self.config = config
        self.audio_file = args.audio_file

        self.preprocess_conf = config.preprocess_config
        self.preprocess_args = {"train": False}
        self.preprocessing = Transformation(self.preprocess_conf)
        self.text_feature = TextFeaturizer(
            unit_type=config.unit_type,
            vocab=config.vocab_filepath,
            spm_model_prefix=config.spm_model_prefix)

        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

        # model
        model_conf = config
        with UpdateConfig(model_conf):
            model_conf.input_dim = config.feat_dim
            model_conf.output_dim = self.text_feature.vocab_size
        model = U2Model.from_config(model_conf)
        self.model = model
        self.model.eval()
        self.ptq = PTQ()
        self.model = self.ptq.quantize(model)

        # load model; the weights are loaded into the already quantized model
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)

    def run(self):
        """Decode the audio file, then export the quantized model.

        The audio file is validated with ``check()`` first. The transcript is
        logged, not returned.
        """
        # Use the instance attribute instead of the module-level `args`, so
        # the class also works when it is imported rather than run as a script.
        check(self.audio_file)

        with paddle.no_grad():
            # read; only the first channel is used
            audio, _ = soundfile.read(
                self.audio_file, dtype="int16", always_2d=True)
            audio = audio[:, 0]
            logger.info(f"audio shape: {audio.shape}")

            # fbank
            feat = self.preprocessing(audio, **self.preprocess_args)
            logger.info(f"feat shape: {feat.shape}")

            ilen = paddle.to_tensor(feat.shape[0])
            xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
            decode_config = self.config.decode
            logger.info(f"decode cfg: {decode_config}")
            reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
            result_transcripts = self.model.decode(
                xs,
                ilen,
                text_feature=self.text_feature,
                decoding_method=decode_config.decoding_method,
                beam_size=decode_config.beam_size,
                ctc_weight=decode_config.ctc_weight,
                decoding_chunk_size=decode_config.decoding_chunk_size,
                num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
                simulate_streaming=decode_config.simulate_streaming,
                reverse_weight=reverse_weight)
            rsl = result_transcripts[0][0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {rsl}")
            # print(self.model)
            # print(self.model.forward_encoder_chunk)

            logger.info("-------------start quant ----------------------")
            # NOTE(review): the export parameters below are hard-coded rather
            # than read from the config -- confirm they match the model.
            batch_size = 1
            feat_dim = 80
            model_size = 512
            num_left_chunks = -1
            reverse_weight = 0.3
            logger.info(
                f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}"
            )

            # ######################## self.model.forward_feature ############
            # input_spec = [
            #     # (T,), int16
            #     paddle.static.InputSpec(shape=[None], dtype='int16'),
            # ]
            # self.model.forward_feature = paddle.jit.to_static(
            #     self.model.forward_feature, input_spec=input_spec)

            ######################### self.model.forward_encoder_chunk ############
            input_spec = [
                # xs, (B, T, D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, feat_dim], dtype='float32'),
                # offset, int, but need be tensor
                paddle.static.InputSpec(shape=[1], dtype='int32'),
                # required_cache_size, int
                num_left_chunks,
                # att_cache
                paddle.static.InputSpec(
                    shape=[None, None, None, None], dtype='float32'),
                # cnn_cache
                paddle.static.InputSpec(
                    shape=[None, None, None, None], dtype='float32')
            ]
            self.model.forward_encoder_chunk = paddle.jit.to_static(
                self.model.forward_encoder_chunk, input_spec=input_spec)

            ######################### self.model.ctc_activation ########################
            input_spec = [
                # encoder_out, (B,T,D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, model_size], dtype='float32')
            ]
            self.model.ctc_activation = paddle.jit.to_static(
                self.model.ctc_activation, input_spec=input_spec)

            ######################### self.model.forward_attention_decoder ########################
            input_spec = [
                # hyps, (B, U)
                paddle.static.InputSpec(shape=[None, None], dtype='int64'),
                # hyps_lens, (B,)
                paddle.static.InputSpec(shape=[None], dtype='int64'),
                # encoder_out, (B,T,D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, model_size], dtype='float32'),
                reverse_weight
            ]
            self.model.forward_attention_decoder = paddle.jit.to_static(
                self.model.forward_attention_decoder, input_spec=input_spec)
            ################################################################################

            # save the quantized model
            logger.info(f"export save: {self.args.export_path}")
            self.ptq.save_quantized_model(self.model, self.args.export_path)
            # paddle.jit.save(
            #     self.model,
            #     self.args.export_path,
            #     combine_params=True,
            #     skip_forward=True)
|
||||
|
||||
|
||||
def check(audio_file):
    """Validate that *audio_file* exists, can be read and is sampled at 16 kHz.

    Every failure is reported and terminates the process through
    ``sys.exit(-1)``.

    Args:
        audio_file: path of the audio file to validate.
    """
    if not os.path.isfile(audio_file):
        print("Please input the right audio file path")
        sys.exit(-1)

    logger.info("checking the audio file format......")
    try:
        _, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would let audio with a wrong sample rate through.
    if sample_rate != 16000:
        logger.error("the sample rate of the audio file must be 16000")
        sys.exit(-1)
    logger.info("The audio file format is right")
|
||||
|
||||
|
||||
def main(config, args):
    """Create a U2Infer from *config* and *args* and execute its run()."""
    infer = U2Infer(config, args)
    infer.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = default_argument_parser()
    # options specific to this script, on top of the shared defaults
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    parser.add_argument(
        "--export_path",
        type=str,
        default='export',
        help="path to save the exported quantized model")
    args = parser.parse_args()

    # Merge order: main config file, then the decode config (stored under
    # `config.decode`), then `--opts` overrides from the command line.
    config = CfgNode(new_allowed=True)

    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    main(config, args)
|
@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
@ -0,0 +1,64 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Evaluation for wav2vec2.0 model."""
|
||||
import cProfile
|
||||
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
|
||||
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||
from paddlespeech.s2t.utils.utility import print_arguments
|
||||
|
||||
|
||||
def main_sp(config, args):
    """Create the tester and, inside its eval() context, set it up and run the test."""
    tester = Tester(config, args)
    with tester.eval():
        tester.setup()
        tester.run_test()
|
||||
|
||||
|
||||
def main(config, args):
    """Entry point handed to the profiler; delegates to main_sp()."""
    main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = default_argument_parser()
    # options specific to this script, on top of the shared defaults
    parser.add_argument(
        '--dict-path', type=str, default=None, help='dict path.')
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    # Merge order: main config file, then the decode config (stored under
    # `config.decode`), then `--opts` overrides from the command line.
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as dump_file:
            print(config, file=dump_file)

    # Run main() under cProfile and write the statistics to 'test.profile'.
    profiler = cProfile.Profile()
    profiler.runcall(main, config, args)
    profiler.dump_stats('test.profile')
|
@ -0,0 +1,118 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Evaluation for wav2vec2.0 model."""
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import paddle
|
||||
import soundfile
|
||||
from yacs.config import CfgNode
|
||||
|
||||
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
|
||||
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
|
||||
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||
from paddlespeech.s2t.utils.log import Log
|
||||
from paddlespeech.s2t.utils.utility import UpdateConfig
|
||||
logger = Log(__name__).getlog()
|
||||
|
||||
|
||||
class Wav2vec2Infer():
    """Decode a single audio file with a Wav2vec2ASR model."""

    def __init__(self, config, args):
        """Build the text featurizer and the model, then load its weights.

        Args:
            config: configuration node; ``unit_type`` and ``vocab_filepath``
                are read here, ``decode`` in ``run()``.
            args: parsed command line arguments; ``audio_file``, ``ngpu``
                and ``checkpoint_path`` are read.
        """
        self.args = args
        self.config = config
        self.audio_file = args.audio_file

        self.text_feature = TextFeaturizer(
            unit_type=config.unit_type, vocab=config.vocab_filepath)
        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

        # model
        model_conf = config
        with UpdateConfig(model_conf):
            model_conf.output_dim = self.text_feature.vocab_size
        model = Wav2vec2ASR.from_config(model_conf)
        self.model = model
        self.model.eval()

        # load model
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)

    def run(self):
        """Validate and decode the audio file.

        Returns:
            The first entry of the transcripts returned by the model's
            ``decode()``; it is also logged.
        """
        # Use the instance attribute instead of the module-level `args`, so
        # the class also works when it is imported rather than run as a script.
        check(self.audio_file)

        with paddle.no_grad():
            # read
            audio, _ = soundfile.read(
                self.audio_file, dtype="int16", always_2d=True)
            logger.info(f"audio shape: {audio.shape}")

            xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
            decode_config = self.config.decode
            result_transcripts, result_tokenids = self.model.decode(
                xs,
                text_feature=self.text_feature,
                decoding_method=decode_config.decoding_method,
                beam_size=decode_config.beam_size)
            rsl = result_transcripts[0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {rsl}")
            return rsl
|
||||
|
||||
|
||||
def check(audio_file):
    """Validate that *audio_file* exists, can be read and is sampled at 16 kHz.

    Every failure is reported and terminates the process through
    ``sys.exit(-1)``.

    Args:
        audio_file: path of the audio file to validate.
    """
    if not os.path.isfile(audio_file):
        print("Please input the right audio file path")
        sys.exit(-1)

    logger.info("checking the audio file format......")
    try:
        _, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    # Explicit check instead of `assert`: assert statements are removed under
    # `python -O`, which would let audio with a wrong sample rate through.
    if sample_rate != 16000:
        logger.error("the sample rate of the audio file must be 16000")
        sys.exit(-1)
    logger.info("The audio file format is right")
|
||||
|
||||
|
||||
def main(config, args):
    """Create a Wav2vec2Infer from *config* and *args* and execute its run()."""
    infer = Wav2vec2Infer(config, args)
    infer.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = default_argument_parser()
    # options specific to this script, on top of the shared defaults
    parser.add_argument(
        "--result_file", type=str, help="path of save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()

    # Merge order: main config file, then the decode config (stored under
    # `config.decode`), then `--opts` overrides from the command line.
    config = CfgNode(new_allowed=True)

    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    main(config, args)
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in new issue