commit 8e348d66b9
@@ -0,0 +1,67 @@

([简体中文](./README_cn.md)|English)

# Streaming Speech Synthesis Service

## Introduction

This demo shows how to start the streaming speech synthesis service and how to access it from a client.

The `Server` must run inside the Docker container, while the `Client` does not have to.

**The `streaming_tts_serving` directory under the current path (`$PWD`) contains the model's configuration and code, and must be mapped into the Docker container before use.**

## Usage

### 1. Server

#### 1.1 Docker

```bash
docker pull registry.baidubce.com/paddlepaddle/fastdeploy_serving_cpu_only:22.09
docker run -dit --net=host --name fastdeploy --shm-size="1g" -v $PWD:/models registry.baidubce.com/paddlepaddle/fastdeploy_serving_cpu_only:22.09
docker exec -it -u root fastdeploy bash
```

#### 1.2 Installation (inside the docker)

```bash
apt-get install build-essential python3-dev libssl-dev libffi-dev libxml2 libxml2-dev libxslt1-dev zlib1g-dev libsndfile1 language-pack-zh-hans wget zip
pip3 install paddlespeech
export LC_ALL="zh_CN.UTF-8"
export LANG="zh_CN.UTF-8"
export LANGUAGE="zh_CN:zh:en_US:en"
```

#### 1.3 Download models (inside the docker)

```bash
cd /models/streaming_tts_serving/1
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_onnx_0.2.0.zip
unzip fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip
unzip mb_melgan_csmsc_onnx_0.2.0.zip
```

**For convenience, we recommend using the `-v` option of `docker run` to map `$PWD` (the `streaming_tts_serving` directory with the model's configuration and code) to the Docker path `/models`. You can also use other methods, but whichever you choose, the final model directory and structure inside the container must match the following figure.**

<p align="center">
  <img src="./tree.png" />
</p>

#### 1.4 Start the server (inside the docker)

```bash
fastdeployserver --model-repository=/models --model-control-mode=explicit --load-model=streaming_tts_serving
```

Arguments:
- `model-repository` (required): Path of the model repository.
- `model-control-mode` (required): Model loading mode. At present, only 'explicit' is supported.
- `load-model` (required): Name of the model to load.
- `http-port` (optional): Port for the HTTP service. Default: `8000`. Not used in this example.
- `grpc-port` (optional): Port for the gRPC service. Default: `8001`.
- `metrics-port` (optional): Port for the metrics service. Default: `8002`. Not used in this example.
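
Once the server is up, you can sanity-check it from the host before sending synthesis requests. A minimal sketch using the same `tritonclient` gRPC API the demo client uses (the URL assumes the default `grpc-port` of 8001):

```python
import tritonclient.grpc as grpcclient

# Connect to the Triton-compatible fastdeployserver over gRPC (default port 8001).
client = grpcclient.InferenceServerClient(url="localhost:8001")
assert client.is_server_ready(), "server is not ready"
# Verify that the model named by --load-model was actually loaded.
assert client.is_model_ready("streaming_tts_serving"), "model is not ready"
print("server and model are ready")
```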

### 2. Client

#### 2.1 Installation

```bash
pip3 install tritonclient[all]
```

#### 2.2 Send request

```bash
python3 /models/streaming_tts_serving/stream_client.py
```
@@ -0,0 +1,33 @@
name: "streaming_tts_serving"
backend: "python"
max_batch_size: 0
model_transaction_policy {
  decoupled: True
}
input [
  {
    name: "INPUT_0"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]

output [
  {
    name: "OUTPUT_0"
    data_type: TYPE_FP32
    dims: [ -1, 1 ]
  },
  {
    name: "status"
    data_type: TYPE_BOOL
    dims: [ 1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
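`decoupled: True` is what makes streaming possible here: a decoupled Python-backend model may send any number of responses per request through a response sender instead of returning exactly one. The repository's `model.py` is not shown in this diff; the following is only a minimal sketch of how such a model streams chunks (the `self.synthesize` generator is a hypothetical stand-in for the real synthesis code):

```python
import numpy as np
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    def execute(self, requests):
        for request in requests:
            # In decoupled mode, responses go through a sender, not a return value.
            sender = request.get_response_sender()
            text = pb_utils.get_input_tensor_by_name(request, "INPUT_0").as_numpy()
            for is_last, wav_chunk in self.synthesize(text):  # hypothetical generator
                out = pb_utils.Tensor("OUTPUT_0", wav_chunk.astype(np.float32))
                status = pb_utils.Tensor("status", np.array([is_last], dtype=bool))
                sender.send(pb_utils.InferenceResponse(output_tensors=[out, status]))
            # Tell the client that no more responses will arrive for this request.
            sender.send(flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
        return None
```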
@@ -0,0 +1,117 @@
#!/usr/bin/env python
import argparse
import queue
import sys
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import *

FLAGS = None


class UserData:
    def __init__(self):
        self._completed_requests = queue.Queue()


# Define the callback function. Note the last two parameters should be
# result and error. InferenceServerClient would provide the results of an
# inference as grpcclient.InferResult in result. For successful
# inference, error will be None, otherwise it will be an object of
# tritonclientutils.InferenceServerException holding the error details
def callback(user_data, result, error):
    if error:
        user_data._completed_requests.put(error)
    else:
        user_data._completed_requests.put(result)


def async_stream_send(triton_client, values, request_id, model_name):

    infer_inputs = []
    outputs = []
    for idx, data in enumerate(values):
        data = np.array([data.encode('utf-8')], dtype=np.object_)
        infer_input = grpcclient.InferInput('INPUT_0', [len(data)], "BYTES")
        infer_input.set_data_from_numpy(data)
        infer_inputs.append(infer_input)

    outputs.append(grpcclient.InferRequestedOutput('OUTPUT_0'))
    # Issue the asynchronous sequence inference.
    triton_client.async_stream_infer(
        model_name=model_name,
        inputs=infer_inputs,
        outputs=outputs,
        request_id=request_id)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action="store_true",
        required=False,
        default=False,
        help='Enable verbose output')
    parser.add_argument(
        '-u',
        '--url',
        type=str,
        required=False,
        default='localhost:8001',
        help='Inference server URL and its gRPC port. Default is localhost:8001.')

    FLAGS = parser.parse_args()

    # The streaming TTS model takes one text input and streams back
    # audio chunks until the `status` output signals the last chunk.
    model_name = "streaming_tts_serving"

    values = ["哈哈哈哈"]

    request_id = "0"

    string_result0_list = []

    user_data = UserData()

    # It is advisable to use the client object within a with..as clause
    # when sending streaming requests. This ensures the client
    # is closed when the block inside with exits.
    with grpcclient.InferenceServerClient(
            url=FLAGS.url, verbose=FLAGS.verbose) as triton_client:
        try:
            # Establish stream
            triton_client.start_stream(callback=partial(callback, user_data))
            # Now send the inference sequences...
            async_stream_send(triton_client, values, request_id, model_name)
        except InferenceServerException as error:
            print(error)
            sys.exit(1)

        # Retrieve results...
        recv_count = 0
        result_dict = {}
        status = True
        while True:
            data_item = user_data._completed_requests.get()
            if type(data_item) == InferenceServerException:
                raise data_item
            else:
                this_id = data_item.get_response().id
                if this_id not in result_dict.keys():
                    result_dict[this_id] = []
                result_dict[this_id].append((recv_count, data_item))
                sub_wav = data_item.as_numpy('OUTPUT_0')
                status = data_item.as_numpy('status')
                print('sub_wav = ', sub_wav, "sub_wav.shape = ", sub_wav.shape)
                print('status = ', status)
                if status[0] == 1:
                    break
            recv_count += 1

    print("PASS: stream_client")
(binary image added, 24 KiB: the model-directory tree figure referenced as ./tree.png in the README above)
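The client above only prints each chunk. To actually listen to the result, you can buffer the chunks and write a wav once `status` flips. A minimal sketch (assuming the 24 kHz sample rate of the CSMSC FastSpeech2/MB-MelGAN models downloaded above; check your model if it differs):

```python
import numpy as np
import soundfile  # pip3 install soundfile

chunks = []
# ... inside the receive loop of stream_client.py, in addition to printing:
#     chunks.append(data_item.as_numpy('OUTPUT_0').reshape(-1))
wav = np.concatenate(chunks) if chunks else np.zeros(0, dtype=np.float32)
soundfile.write("out.wav", wav, samplerate=24000)  # 24 kHz assumed for CSMSC
```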
@@ -0,0 +1,44 @@
###########################################################
#                       DATA SETTING                      #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-3.0-base-zh
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-3.0-base-zh
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and closer to 1.0 is better.

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
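The `ErnieLinear` model named by `model_type` is, in essence, a pretrained ERNIE encoder with a token-level linear classifier over `num_classes` punctuation labels. A minimal sketch of that architecture (assuming `paddlenlp`'s `ErnieModel` with the 2.x tuple return; an illustration, not the exact PaddleSpeech class):

```python
import paddle.nn as nn
from paddlenlp.transformers import ErnieModel


class ErnieLinearSketch(nn.Layer):
    """Pretrained ERNIE encoder + per-token linear punctuation classifier."""

    def __init__(self, pretrained_token="ernie-3.0-base-zh", num_classes=4):
        super().__init__()
        self.ernie = ErnieModel.from_pretrained(pretrained_token)
        # 768 is the hidden size of ernie-3.0-base-zh (assumed for this sketch)
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, input_ids):
        # paddlenlp 2.x returns (sequence_output, pooled_output)
        sequence_output, _ = self.ernie(input_ids)
        # one logit vector per token, e.g. "no punctuation" plus three marks
        return self.classifier(sequence_output)
```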
@@ -0,0 +1,44 @@
###########################################################
#                       DATA SETTING                      #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-3.0-medium-zh
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-3.0-medium-zh
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and closer to 1.0 is better.

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
@@ -0,0 +1,44 @@
###########################################################
#                       DATA SETTING                      #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-3.0-mini-zh
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-3.0-mini-zh
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and closer to 1.0 is better.

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
@@ -0,0 +1,44 @@
###########################################################
#                       DATA SETTING                      #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-3.0-nano-zh
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-3.0-nano-zh
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and closer to 1.0 is better.

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
@@ -0,0 +1,44 @@
###########################################################
#                       DATA SETTING                      #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-tiny
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-tiny
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 0.9999 # scheduler gamma; must be in (0.0, 1.0), and closer to 1.0 is better.

###########################################################
#                     TRAINING SETTING                    #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
@@ -0,0 +1,8 @@
# LibriSpeech

## Wav2VecASR
Train: 1 epoch on 1×V100-32G, batch size 10

| Model | Params | Config | Augmentation | Test set | Decode method | WER |
| --- | --- | --- | --- | --- | --- | --- |
| wav2vec2ASR | 302.86 M | conf/wav2vec2ASR.yaml | spec_aug | test-clean | greedy search | 0.018887 |
@@ -0,0 +1,4 @@
process:
  # use raw audio
  - type: wav_process
    dither: 0.0
@@ -0,0 +1,4 @@
decode_batch_size: 1
error_rate_type: wer
decoding_method: ctc_greedy_search # 'ctc_greedy_search', 'ctc_prefix_beam_search'
beam_size: 10
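`ctc_greedy_search` is the simplest CTC decoding rule: take the argmax label at every frame, collapse consecutive repeats, then drop blanks. A minimal sketch of that rule (assuming blank id 0, as in the model config below):

```python
import numpy as np


def ctc_greedy_search(log_probs: np.ndarray, blank_id: int = 0) -> list:
    """log_probs: (T, vocab) per-frame log-probabilities."""
    best = log_probs.argmax(axis=-1)  # (T,) frame-wise argmax
    collapsed = [int(t) for i, t in enumerate(best)
                 if i == 0 or t != best[i - 1]]  # collapse repeats
    return [t for t in collapsed if t != blank_id]  # drop blanks
```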
@@ -0,0 +1,120 @@
############################################
#           Network Architecture          #
############################################
freeze_wav2vec2: True
normalize_wav: True
output_norm: True
dnn_blocks: 2
dnn_neurons: 1024
blank_id: 0
ctc_dropout_rate: 0.0
wav2vec2_params_path: "exp/wav2vec2/wav2vec2-large-960h-lv60-self.pdparams"

############################################
#                Wav2Vec2.0                #
############################################
vocab_size: 32
hidden_size: 1024
num_hidden_layers: 24
num_attention_heads: 16
intermediate_size: 4096
hidden_act: "gelu"
hidden_dropout: 0.1
activation_dropout: 0.1
attention_dropout: 0.1
feat_proj_dropout: 0.1
feat_quantizer_dropout: 0.0
final_dropout: 0.1
layerdrop: 0.1
initializer_range: 0.02
layer_norm_eps: 1e-5
feat_extract_norm: "layer"
feat_extract_activation: "gelu"
conv_dim: [512, 512, 512, 512, 512, 512, 512]
conv_stride: [5, 2, 2, 2, 2, 2, 2]
conv_kernel: [10, 3, 3, 3, 3, 2, 2]
conv_bias: True
num_conv_pos_embeddings: 128
num_conv_pos_embedding_groups: 16
do_stable_layer_norm: True
apply_spec_augment: False
mask_time_prob: 0.05
mask_time_length: 10
mask_time_min_masks: 2
mask_feature_prob: 0.0
mask_feature_length: 10
mask_feature_min_masks: 0
num_codevectors_per_group: 320
num_codevector_groups: 2
contrastive_logits_temperature: 0.1
num_negatives: 100
codevector_dim: 256
proj_codevector_dim: 256
diversity_loss_weight: 0.1
ctc_loss_reduction: "sum"
ctc_zero_infinity: False
use_weighted_layer_sum: False
pad_token_id: 0
bos_token_id: 1
eos_token_id: 2
add_adapter: False
adapter_kernel_size: 3
adapter_stride: 2
num_adapter_layers: 3
output_hidden_size: None

###########################################
#                   Data                  #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean

###########################################
#                Dataloader               #
###########################################
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
mean_std_filepath: ""
preprocess_config: conf/preprocess.yaml
sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 10 # A different batch_size may cause large differences in results
maxlen_in: 51200000000 # if input length > maxlen_in, batch size is automatically reduced
maxlen_out: 1500000 # if output length > maxlen_out, batch size is automatically reduced
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1
dist_sampler: True
shortest_first: True
return_lens_rate: True

###########################################
#                 Training                #
###########################################
n_epoch: 1
accum_grad: 1
global_grad_clip: 3.0
model_optim: adadelta
model_optim_conf:
  lr: 0.9
  epsilon: 1.0e-6
  rho: 0.95
scheduler: constantlr
scheduler_conf:
  warmup_steps: 25000
  lr_decay: 1.0
log_interval: 1
checkpoint:
  kbest_n: 50
  latest_n: 5
augment: True
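The `dnn_blocks`/`dnn_neurons`/`blank_id` settings above describe the ASR head stacked on the (frozen) wav2vec2 encoder: a couple of fully connected blocks followed by a linear CTC output layer. A minimal sketch of such a head in Paddle, under the config's values (an illustration of the idea, not the exact PaddleSpeech module):

```python
import paddle.nn as nn


class CTCHeadSketch(nn.Layer):
    """DNN blocks + CTC projection, as configured by dnn_blocks/dnn_neurons."""

    def __init__(self, enc_dim=1024, dnn_blocks=2, dnn_neurons=1024,
                 vocab_size=32, ctc_dropout_rate=0.0):
        super().__init__()
        layers = []
        in_dim = enc_dim
        for _ in range(dnn_blocks):
            layers += [nn.Linear(in_dim, dnn_neurons), nn.ReLU()]
            in_dim = dnn_neurons
        self.dnn = nn.Sequential(*layers)
        self.dropout = nn.Dropout(ctc_dropout_rate)
        self.ctc_out = nn.Linear(in_dim, vocab_size)  # logits incl. blank_id 0

    def forward(self, encoder_out):  # (B, T, enc_dim) from wav2vec2
        return self.ctc_out(self.dropout(self.dnn(encoder_out)))
```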
@@ -0,0 +1,110 @@
#!/bin/bash

stage=-1
stop_stage=100

unit_type=char
dict_dir=data/lang_char

source ${MAIN_ROOT}/utils/parse_options.sh

mkdir -p data
mkdir -p ${dict_dir}
TARGET_DIR=${MAIN_ROOT}/dataset
mkdir -p ${TARGET_DIR}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    # download data, generate manifests
    python3 ${TARGET_DIR}/librispeech/librispeech.py \
    --manifest_prefix="data/manifest" \
    --target_dir="${TARGET_DIR}/librispeech" \
    --full_download="True"

    if [ $? -ne 0 ]; then
        echo "Prepare LibriSpeech failed. Terminated."
        exit 1
    fi

    for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        mv data/manifest.${set} data/manifest.${set}.raw
    done

    rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
    for set in train-clean-100 train-clean-360 train-other-500; do
        cat data/manifest.${set}.raw >> data/manifest.train.raw
    done

    for set in dev-clean dev-other; do
        cat data/manifest.${set}.raw >> data/manifest.dev.raw
    done

    for set in test-clean test-other; do
        cat data/manifest.${set}.raw >> data/manifest.test.raw
    done
fi

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # compute mean and stddev for normalizer
    num_workers=$(nproc)
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --num_samples=2000 \
    --spectrum_type="fbank" \
    --feat_dim=161 \
    --delta_delta=false \
    --sample_rate=16000 \
    --stride_ms=10 \
    --window_ms=25 \
    --use_dB_normalization=False \
    --num_workers=${num_workers} \
    --output_path="data/mean_std.json"

    if [ $? -ne 0 ]; then
        echo "Compute mean and stddev failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # build vocabulary
    python3 ${MAIN_ROOT}/utils/build_vocab.py \
    --unit_type ${unit_type} \
    --count_threshold=0 \
    --vocab_path="${dict_dir}/vocab.txt" \
    --manifest_paths="data/manifest.train.raw"

    if [ $? -ne 0 ]; then
        echo "Build vocabulary failed. Terminated."
        exit 1
    fi
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # format manifest with tokenids, vocab size
    for set in train dev test dev-clean dev-other test-clean test-other; do
    {
        python3 ${MAIN_ROOT}/utils/format_data.py \
        --cmvn_path "data/mean_std.json" \
        --unit_type ${unit_type} \
        --vocab_path="${dict_dir}/vocab.txt" \
        --manifest_path="data/manifest.${set}.raw" \
        --output_path="data/manifest.${set}"

        if [ $? -ne 0 ]; then
            echo "Format manifest.${set} failed. Terminated."
            exit 1
        fi
    } &
    done
    wait
fi

echo "LibriSpeech Data preparation done."

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    mkdir -p exp/wav2vec2
    echo "Pretrained wav2vec2 model download"
    wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams
fi

exit 0
@@ -0,0 +1,84 @@
#!/bin/bash

set -e

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

expdir=exp
datadir=data

train_set=train_960
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"  # only decode test-clean here; the line above lists the full set

config_path=$1
decode_config_path=$2
ckpt_prefix=$3

source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

python3 utils/format_rsl.py \
    --origin_ref data/manifest.test-clean.raw \
    --trans_ref data/manifest.test-clean.text

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=16
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${type}.rsl \
        --trans_hyp ${ckpt_prefix}.${type}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
    echo "decoding ${type} done."
done

for type in ctc_prefix_beam_search; do
    echo "decoding ${type}"
    batch_size=1
    python3 -u ${BIN_DIR}/test.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${ckpt_prefix}.${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
    --opts decode.decode_batch_size ${batch_size}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
    python3 utils/format_rsl.py \
        --origin_hyp ${ckpt_prefix}.${type}.rsl \
        --trans_hyp ${ckpt_prefix}.${type}.rsl.text

    python3 utils/compute-wer.py --char=1 --v=1 \
        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
    echo "decoding ${type} done."
done

echo "Finished"

exit 0
@@ -0,0 +1,58 @@
#!/bin/bash

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
if [ $? -ne 0 ]; then
    exit 1
fi

if [ ! -f ${audio_file} ]; then
    echo "Please input the right audio_file path"
    exit 1
fi

chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

for type in ctc_greedy_search; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/test_wav.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
@@ -0,0 +1,55 @@
#!/bin/bash

if [ $# -lt 2 ] || [ $# -gt 3 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
ips=$3

if [ ! $ips ];then
    ips_config=
else
    ips_config="--ips="${ips}
fi

mkdir -p exp

# seed may break model convergence
seed=1998
if [ ${seed} != 0 ]; then
    export FLAGS_cudnn_deterministic=True
fi

# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
export FLAGS_allocator_strategy=naive_best_fit
if [ ${ngpu} == 0 ]; then
    python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --seed ${seed}
else
    python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output exp/${ckpt_name} \
    --seed ${seed}
fi
# capture the training exit status before the following commands overwrite $?
status=$?

if [ ${seed} != 0 ]; then
    unset FLAGS_cudnn_deterministic
fi

if [ ${status} -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi

exit 0
@@ -0,0 +1,15 @@
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/


MODEL=wav2vec2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
@@ -0,0 +1,47 @@
#!/bin/bash
set -e

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;

gpus=0
stage=0
stop_stage=0
conf_path=conf/wav2vec2ASR.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
dict_path=data/lang_char/vocab.txt

. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;

audio_file=data/demo_002_en.wav

avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh || exit -1
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # avg n best model
    avg.sh best exp/${ckpt}/checkpoints ${avg_num}
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # greedy search decoder
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # test a single .wav file
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
fi
@@ -0,0 +1 @@
../../../utils
@@ -0,0 +1,110 @@
#!/bin/bash

set -e
source path.sh


input_dir=./input/SSB0005_mini
newdir_name="newdir"
new_dir=${input_dir}/${newdir_name}
pretrained_model_dir=./pretrained_models/fastspeech2_mix_ckpt_1.2.0
mfa_tools=./tools
mfa_dir=./mfa_result
dump_dir=./dump
output_dir=./exp/default
lang=zh
ngpu=1
finetune_config=./conf/finetune.yaml
replace_spkid=174 # csmsc: 174, ljspeech: 175, aishell3: 0~173, vctk: 176

ckpt=snapshot_iter_99300

gpus=1
export CUDA_VISIBLE_DEVICES=${gpus}
stage=0
stop_stage=100


# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

# check oov
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "check oov"
    python3 local/check_oov.py \
        --input_dir=${input_dir} \
        --pretrained_model_dir=${pretrained_model_dir} \
        --newdir_name=${newdir_name} \
        --lang=${lang}
fi

# get mfa result
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "get mfa result"
    python3 local/get_mfa_result.py \
        --input_dir=${new_dir} \
        --mfa_dir=${mfa_dir} \
        --lang=${lang}
fi

# generate durations.txt
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "generate durations.txt"
    python3 local/generate_duration.py \
        --mfa_dir=${mfa_dir}
fi

# extract feature
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "extract feature"
    python3 local/extract_feature.py \
        --duration_file="./durations.txt" \
        --input_dir=${new_dir} \
        --dump_dir=${dump_dir} \
        --pretrained_model_dir=${pretrained_model_dir} \
        --replace_spkid=$replace_spkid
fi

# create finetune env
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "create finetune env"
    python3 local/prepare_env.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --output_dir=${output_dir}
fi

# finetune
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    echo "finetune..."
    python3 local/finetune.py \
        --pretrained_model_dir=${pretrained_model_dir} \
        --dump_dir=${dump_dir} \
        --output_dir=${output_dir} \
        --ngpu=${ngpu} \
        --epoch=100 \
        --finetune_config=${finetune_config}
fi

# synthesize e2e
if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
    echo "in hifigan syn_e2e"
    python3 ${BIN_DIR}/../synthesize_e2e.py \
        --am=fastspeech2_aishell3 \
        --am_config=${pretrained_model_dir}/default.yaml \
        --am_ckpt=${output_dir}/checkpoints/${ckpt}.pdz \
        --am_stat=${pretrained_model_dir}/speech_stats.npy \
        --voc=hifigan_aishell3 \
        --voc_config=pretrained_models/hifigan_aishell3_ckpt_0.2.0/default.yaml \
        --voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
        --voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
        --lang=mix \
        --text=${BIN_DIR}/../sentences_mix.txt \
        --output_dir=./test_e2e/ \
        --phones_dict=${dump_dir}/phone_id_map.txt \
        --speaker_dict=${dump_dir}/speaker_id_map.txt \
        --spk_id=$replace_spkid
fi
@@ -0,0 +1,101 @@
############################################
#           Network Architecture          #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    use_cnn_module: True
    cnn_module_kernel: 15
    activation_type: swish
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    reverse_weight: 0.0 # unidirectional decoder
    length_normalized_loss: false
    init_type: 'kaiming_uniform'

# https://yaml.org/type/float.html
###########################################
#                   Data                  #
###########################################
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list

###########################################
#                Dataloader               #
###########################################
use_streaming_data: True
unit_type: 'char'
vocab_filepath: data/lang_char/vocab.txt
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
do_filter: True
maxlen_in: 1200  # if do_filter == False && input length > maxlen_in, batch size is automatically reduced
maxlen_out: 100  # if do_filter == False && output length > maxlen_out, batch size is automatically reduced
minlen_in: 10
minlen_out: 0
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1


###########################################
#                 Training                #
###########################################
n_epoch: 26
accum_grad: 32
global_grad_clip: 5.0
dist_sampler: True
log_interval: 1
checkpoint:
    kbest_n: 50
    latest_n: 5
optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 5000
    lr_decay: 1.0
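The `warmuplr` scheduler above is the usual transformer warmup schedule: the learning rate ramps up linearly for `warmup_steps` and then decays with the inverse square root of the step. In the common formulation (a sketch of the standard rule; check the `WarmupLR` class in the codebase for the exact variant):

```latex
% lr at step t, with peak lr = 0.001 and warmup_steps = 5000 from this config
\mathrm{lr}(t) = \mathrm{lr}_{\text{peak}} \cdot \mathrm{warmup\_steps}^{0.5}
                 \cdot \min\!\left(t^{-0.5},\; t \cdot \mathrm{warmup\_steps}^{-1.5}\right)
```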
@@ -0,0 +1,100 @@
############################################
#           Network Architecture          #
############################################
cmvn_file:
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d # encoder input type; you can choose conv2d, conv2d6 or conv2d8
    normalize_before: True
    use_cnn_module: True
    cnn_module_kernel: 15
    activation_type: swish
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    causal: true
    use_dynamic_chunk: true
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
    use_dynamic_left_chunk: false
# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 3   # the number of forward decoder blocks
    r_num_blocks: 3 # the number of reverse decoder blocks; only for bitransformer
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1 # label smoothing option
    length_normalized_loss: false
    reverse_weight: 0.3 # only for bitransformer decoder
    init_type: 'kaiming_uniform' # !Warning: needed for convergence

###########################################
#                   Data                  #
###########################################
train_manifest: data/train_l/data.list
dev_manifest: data/dev/data.list
test_manifest: data/test_meeting/data.list

###########################################
#                Dataloader               #
###########################################
use_stream_data: True
vocab_filepath: data/lang_char/vocab.txt
unit_type: 'char'
preprocess_config: conf/preprocess.yaml
spm_model_prefix: ''
feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
do_filter: True
maxlen_in: 1200  # if do_filter == False && input length > maxlen_in, batch size is automatically reduced
maxlen_out: 100  # if do_filter == False && output length > maxlen_out, batch size is automatically reduced
minlen_in: 10
minlen_out: 0
minibatches: 0 # for debug
batch_count: auto
batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
num_workers: 0
subsampling_factor: 1
num_encs: 1

###########################################
#                 Training                #
###########################################
n_epoch: 150
accum_grad: 8
global_grad_clip: 5.0
dist_sampler: False
optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 25000
    lr_decay: 1.0
log_interval: 100
checkpoint:
    kbest_n: 50
    latest_n: 5
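For reference, the `ctc_weight` and `reverse_weight` knobs above combine three losses during training. Written out, using the standard hybrid CTC/attention formulation with a bidirectional (U2++-style) decoder — the symbols are the config values:

```latex
\mathcal{L} = \lambda\,\mathcal{L}_{\mathrm{CTC}}
  + (1-\lambda)\left[(1-w_r)\,\mathcal{L}_{\mathrm{att}}^{\rightarrow}
  + w_r\,\mathcal{L}_{\mathrm{att}}^{\leftarrow}\right],
\qquad \lambda = \texttt{ctc\_weight} = 0.3,\quad w_r = \texttt{reverse\_weight} = 0.3
```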
@@ -0,0 +1,12 @@
beam_size: 10
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
        # <0: for decoding, use full chunk.
        # >0: for decoding, use fixed chunk size as set.
        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.
decode_batch_size: 128
error_rate_type: cer
@@ -1,11 +1,12 @@
-decode_batch_size: 128
-error_rate_type: cer
-decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 beam_size: 10
+decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
+reverse_weight: 0.3 # reverse weight for attention rescoring decode mode.
 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
         # <0: for decoding, use full chunk.
         # >0: for decoding, use fixed chunk size as set.
         # 0: used for training, it's prohibited here.
 num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
 simulate_streaming: False # simulate streaming inference. Defaults to False.
+decode_batch_size: 128
+error_rate_type: cer
@@ -0,0 +1,59 @@
#!/bin/bash

if [ $# != 4 ];then
    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
decode_config_path=$2
ckpt_prefix=$3
audio_file=$4

mkdir -p data
wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
if [ $? -ne 0 ]; then
    exit 1
fi

if [ ! -f ${audio_file} ]; then
    echo "Please input the right audio_file path"
    exit 1
fi


chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
    chunk_mode=true
fi

# download language model
#bash local/download_lm_ch.sh
#if [ $? -ne 0 ]; then
#    exit 1
#fi

for type in attention_rescoring; do
    echo "decoding ${type}"
    batch_size=1
    output_dir=${ckpt_prefix}
    mkdir -p ${output_dir}
    python3 -u ${BIN_DIR}/quant.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --decode_cfg ${decode_config_path} \
    --result_file ${output_dir}/${type}.rsl \
    --checkpoint_path ${ckpt_prefix} \
    --opts decode.decoding_method ${type} \
    --opts decode.decode_batch_size ${batch_size} \
    --audio_file ${audio_file}

    if [ $? -ne 0 ]; then
        echo "Failed in evaluation!"
        exit 1
    fi
done
exit 0
@ -0,0 +1,224 @@
|
|||||||
|
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Evaluation for U2 model."""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
import soundfile
|
||||||
|
from paddleslim import PTQ
|
||||||
|
from yacs.config import CfgNode
|
||||||
|
|
||||||
|
from paddlespeech.audio.transform.transformation import Transformation
|
||||||
|
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
|
||||||
|
from paddlespeech.s2t.models.u2 import U2Model
|
||||||
|
from paddlespeech.s2t.training.cli import default_argument_parser
|
||||||
|
from paddlespeech.s2t.utils.log import Log
|
||||||
|
from paddlespeech.s2t.utils.utility import UpdateConfig
|
||||||
|
logger = Log(__name__).getlog()
|
||||||
|
|
||||||
|
|
||||||
|
class U2Infer():
|
||||||
|
def __init__(self, config, args):
|
||||||
|
self.args = args
|
||||||
|
self.config = config
|
||||||
|
self.audio_file = args.audio_file
|
||||||
|
|
||||||
|
self.preprocess_conf = config.preprocess_config
|
||||||
|
self.preprocess_args = {"train": False}
|
||||||
|
self.preprocessing = Transformation(self.preprocess_conf)
|
||||||
|
self.text_feature = TextFeaturizer(
|
||||||
|
unit_type=config.unit_type,
|
||||||
|
vocab=config.vocab_filepath,
|
||||||
|
spm_model_prefix=config.spm_model_prefix)
|
||||||
|
|
||||||
|
paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
|
||||||
|
|
||||||
|
# model
|
||||||
|
model_conf = config
|
||||||
|
with UpdateConfig(model_conf):
|
||||||
|
model_conf.input_dim = config.feat_dim
|
||||||
|
model_conf.output_dim = self.text_feature.vocab_size
|
||||||
|
model = U2Model.from_config(model_conf)
|
||||||
|
self.model = model
|
||||||
|
self.model.eval()
|
||||||
|
self.ptq = PTQ()
|
||||||
|
self.model = self.ptq.quantize(model)
|
||||||
|
|
||||||
|
# load model
|
||||||
|
params_path = self.args.checkpoint_path + ".pdparams"
|
||||||
|
model_dict = paddle.load(params_path)
|
||||||
|
self.model.set_state_dict(model_dict)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
check(args.audio_file)
|
||||||
|
|
||||||
|
with paddle.no_grad():
|
||||||
|
# read
|
||||||
|
audio, sample_rate = soundfile.read(
|
||||||
|
self.audio_file, dtype="int16", always_2d=True)
|
||||||
|
audio = audio[:, 0]
|
||||||
|
logger.info(f"audio shape: {audio.shape}")
|
||||||
|
|
||||||
|
# fbank
|
||||||
|
feat = self.preprocessing(audio, **self.preprocess_args)
|
||||||
|
logger.info(f"feat shape: {feat.shape}")
|
||||||
|
|
||||||
|
ilen = paddle.to_tensor(feat.shape[0])
|
||||||
|
xs = paddle.to_tensor(feat, dtype='float32').unsqueeze(0)
|
||||||
|
decode_config = self.config.decode
|
||||||
|
logger.info(f"decode cfg: {decode_config}")
|
||||||
|
reverse_weight = getattr(decode_config, 'reverse_weight', 0.0)
|
||||||
|
result_transcripts = self.model.decode(
|
||||||
|
xs,
|
||||||
|
ilen,
|
||||||
|
text_feature=self.text_feature,
|
||||||
|
decoding_method=decode_config.decoding_method,
|
||||||
|
beam_size=decode_config.beam_size,
|
||||||
|
ctc_weight=decode_config.ctc_weight,
|
||||||
|
decoding_chunk_size=decode_config.decoding_chunk_size,
|
||||||
|
num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
|
||||||
|
simulate_streaming=decode_config.simulate_streaming,
|
||||||
|
reverse_weight=reverse_weight)
|
||||||
|
rsl = result_transcripts[0][0]
|
||||||
|
utt = Path(self.audio_file).name
|
||||||
|
logger.info(f"hyp: {utt} {rsl}")
|
||||||
|
# print(self.model)
|
||||||
|
# print(self.model.forward_encoder_chunk)
|
||||||
|
|
||||||
|
            logger.info("------------- start quant -------------")
            batch_size = 1
            feat_dim = 80
            model_size = 512
            num_left_chunks = -1
            reverse_weight = 0.3
            logger.info(
                f"U2 Export Model Params: batch_size {batch_size}, feat_dim {feat_dim}, model_size {model_size}, num_left_chunks {num_left_chunks}, reverse_weight {reverse_weight}"
            )

            # ######################## self.model.forward_feature ##################
            # input_spec = [
            #     # (T,), int16
            #     paddle.static.InputSpec(shape=[None], dtype='int16'),
            # ]
            # self.model.forward_feature = paddle.jit.to_static(
            #     self.model.forward_feature, input_spec=input_spec)
            ######################### self.model.forward_encoder_chunk ############
            input_spec = [
                # xs, (B, T, D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, feat_dim], dtype='float32'),
                # offset, an int, but it must be passed as a tensor
                paddle.static.InputSpec(shape=[1], dtype='int32'),
                # required_cache_size, a plain int captured as a constant
                num_left_chunks,
                # att_cache
                paddle.static.InputSpec(
                    shape=[None, None, None, None], dtype='float32'),
                # cnn_cache
                paddle.static.InputSpec(
                    shape=[None, None, None, None], dtype='float32')
            ]
            self.model.forward_encoder_chunk = paddle.jit.to_static(
                self.model.forward_encoder_chunk, input_spec=input_spec)

            ######################### self.model.ctc_activation ###################
            input_spec = [
                # encoder_out, (B, T, D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, model_size], dtype='float32')
            ]
            self.model.ctc_activation = paddle.jit.to_static(
                self.model.ctc_activation, input_spec=input_spec)

            ######################### self.model.forward_attention_decoder ########
            input_spec = [
                # hyps, (B, U)
                paddle.static.InputSpec(shape=[None, None], dtype='int64'),
                # hyps_lens, (B,)
                paddle.static.InputSpec(shape=[None], dtype='int64'),
                # encoder_out, (B, T, D)
                paddle.static.InputSpec(
                    shape=[batch_size, None, model_size], dtype='float32'),
                reverse_weight
            ]
            self.model.forward_attention_decoder = paddle.jit.to_static(
                self.model.forward_attention_decoder, input_spec=input_spec)
            ########################################################################
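A note on the `input_spec` lists above: a `None` dimension stays dynamic in the exported static graph, while plain Python values such as `num_left_chunks` and `reverse_weight` are captured as constants at trace time. Below is a minimal, self-contained sketch of the dynamic-dimension behavior (illustrative only, not part of this diff):

```python
import paddle


def project(x):
    # toy stand-in for an encoder step
    return x * 2.0


# None marks the time axis as dynamic; rank and dtype stay fixed.
spec = [paddle.static.InputSpec(shape=[1, None, 80], dtype='float32')]
static_project = paddle.jit.to_static(project, input_spec=spec)

# Both calls run through the same static graph despite different lengths.
print(static_project(paddle.randn([1, 7, 80])).shape)   # [1, 7, 80]
print(static_project(paddle.randn([1, 31, 80])).shape)  # [1, 31, 80]
```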
            # jit save
            logger.info(f"export save: {self.args.export_path}")
            # PTQ computes the final quantization scales here and saves the
            # result as a static-graph model
            self.ptq.save_quantized_model(self.model, self.args.export_path)
            # paddle.jit.save(
            #     self.model,
            #     self.args.export_path,
            #     combine_params=True,
            #     skip_forward=True)


def check(audio_file):
    if not os.path.isfile(audio_file):
        print("Please input the correct audio file path")
        sys.exit(-1)

    logger.info("checking the audio file format ...")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert sample_rate == 16000, "only 16 kHz wav files are supported"
    logger.info("The audio file format is right")


def main(config, args):
    U2Infer(config, args).run()


if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to result_file
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    parser.add_argument(
        "--export_path",
        type=str,
        default='export',
        help="path to save the exported quantized model")
    args = parser.parse_args()

    config = CfgNode(new_allowed=True)

    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    main(config, args)
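The script above follows PaddleSlim's post-training-quantization flow: `PTQ.quantize()` wraps the dygraph model with observers, the decoding pass in `run()` doubles as the calibration forward pass, and `save_quantized_model()` derives the final scales and exports a static graph. A minimal sketch of the same flow on a toy network; the network, shapes, and export path are placeholders, not part of this diff:

```python
import paddle
import paddle.nn as nn
from paddleslim import PTQ


class TinyNet(nn.Layer):
    # toy stand-in for U2Model
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(80, 16)

    def forward(self, x):
        return self.fc(x)


model = TinyNet()
model.eval()

ptq = PTQ()
quant_model = ptq.quantize(model)  # insert observers around weighted layers

with paddle.no_grad():
    for _ in range(4):  # calibration: forwards collect activation statistics
        quant_model(paddle.randn([4, 80]))

# compute quantization scales and export a static-graph model
ptq.save_quantized_model(
    quant_model,
    './tiny_export',  # placeholder export path
    input_spec=[paddle.static.InputSpec(shape=[None, 80], dtype='float32')])
```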
@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
@ -0,0 +1,64 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for wav2vec2.0 model."""
import cProfile

from yacs.config import CfgNode

from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments


def main_sp(config, args):
    exp = Tester(config, args)
    with exp.eval():
        exp.setup()
        exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    parser.add_argument(
        '--dict-path', type=str, default=None, help='dict path.')
    # save asr result to result_file
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    args = parser.parse_args()
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
    config = CfgNode(new_allowed=True)
    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    # Setting for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats('test.profile')
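Because the test run is executed under `cProfile` and dumped to `test.profile`, the profile can be inspected afterwards with the standard-library `pstats` module:

```python
import pstats

stats = pstats.Stats('test.profile')
# show the 20 most expensive calls by cumulative time
stats.sort_stats('cumulative').print_stats(20)
```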
@ -0,0 +1,118 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference for wav2vec2.0 model on a single audio file."""
import os
import sys
from pathlib import Path

import paddle
import soundfile
from yacs.config import CfgNode

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import UpdateConfig

logger = Log(__name__).getlog()


class Wav2vec2Infer():
    def __init__(self, config, args):
        self.args = args
        self.config = config
        self.audio_file = args.audio_file

        self.text_feature = TextFeaturizer(
            unit_type=config.unit_type, vocab=config.vocab_filepath)
        paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')

        # model
        model_conf = config
        with UpdateConfig(model_conf):
            model_conf.output_dim = self.text_feature.vocab_size
        model = Wav2vec2ASR.from_config(model_conf)
        self.model = model
        self.model.eval()

        # load model
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)
    def run(self):
        check(self.args.audio_file)

        with paddle.no_grad():
            # read the audio and keep only the first channel
            audio, _ = soundfile.read(
                self.audio_file, dtype="int16", always_2d=True)
            audio = audio[:, 0]
            logger.info(f"audio shape: {audio.shape}")

            xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
            decode_config = self.config.decode
            result_transcripts, result_tokenids = self.model.decode(
                xs,
                text_feature=self.text_feature,
                decoding_method=decode_config.decoding_method,
                beam_size=decode_config.beam_size)
            rsl = result_transcripts[0]
            utt = Path(self.audio_file).name
            logger.info(f"hyp: {utt} {rsl}")
            return rsl


def check(audio_file):
    if not os.path.isfile(audio_file):
        print("Please input the correct audio file path")
        sys.exit(-1)

    logger.info("checking the audio file format ...")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "can not open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert sample_rate == 16000, "only 16 kHz wav files are supported"
    logger.info("The audio file format is right")


def main(config, args):
    Wav2vec2Infer(config, args).run()


if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to result_file
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()

    config = CfgNode(new_allowed=True)

    if args.config:
        config.merge_from_file(args.config)
    if args.decode_cfg:
        decode_confs = CfgNode(new_allowed=True)
        decode_confs.merge_from_file(args.decode_cfg)
        config.decode = decode_confs
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    main(config, args)
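Since `Wav2vec2Infer.run()` returns the transcript, the class can also be driven programmatically. A sketch under assumptions: the file above is saved as `test_wav.py`, and every config, checkpoint, and audio path below is a placeholder:

```python
from types import SimpleNamespace

from yacs.config import CfgNode

from test_wav import Wav2vec2Infer  # assumes the script above is test_wav.py

# attribute names mirror what __init__()/run() read; paths are placeholders
args = SimpleNamespace(
    audio_file='zh.wav',  # a 16 kHz mono wav
    checkpoint_path='exp/wav2vec2ASR/checkpoints/avg_1',  # '.pdparams' appended
    ngpu=0,  # 0 selects CPU inference
)

config = CfgNode(new_allowed=True)
config.merge_from_file('wav2vec2ASR.yaml')  # placeholder model config
decode_confs = CfgNode(new_allowed=True)
decode_confs.merge_from_file('decode.yaml')  # placeholder decode config
config.decode = decode_confs
config.freeze()

print(Wav2vec2Infer(config, args).run())
```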