commit
3d5aac6a94
@ -0,0 +1,11 @@
# Changelog

Date: 2022-01-10, Author: Jackwaterveg.

Add features to the CLI:
- Support English (librispeech/asr1/transformer).
- Support choosing `decode_method` for conformer and transformer models.
- Refactor the config, using the unified config.
- PR link: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297

***
@ -1,10 +1,15 @@
# Speech Applications based on PaddleSpeech

([简体中文](./README_cn.md)|English)

This directory contains speech applications for many different scenarios.

* audio tagging - multi-label tagging of an audio file
* automatic_video_subtitiles - generate subtitles from a video
* metaverse - 2D AR with TTS
* punctuation_restoration - restore punctuation from raw text
* speech recognition - recognize the text of an audio file
* speech translation - end-to-end speech translation
* story talker - book reader based on OCR and TTS
* style_fs2 - multi-style control for the FastSpeech2 model
* text_to_speech - convert text into speech
@ -0,0 +1,128 @@
# Customize Dataset for Audio Classification

Following this tutorial, you can customize your dataset for an audio classification task by using `paddlespeech` and `paddleaudio`.

The base class for classification datasets is `paddleaudio.datasets.dataset.AudioClassificationDataset`. To customize your dataset, write a dataset class derived from `AudioClassificationDataset`.

Assuming you have some wave files stored in a directory of your own, you should prepare a meta file listing the file paths and labels. For example, suppose its absolute path is `/PATH/TO/META_FILE.txt`:
```
/PATH/TO/WAVE_FILE/1.wav cat
/PATH/TO/WAVE_FILE/2.wav cat
/PATH/TO/WAVE_FILE/3.wav dog
/PATH/TO/WAVE_FILE/4.wav dog
```
Here is an example of building your custom dataset in `custom_dataset.py`:

```python
from paddleaudio.datasets.dataset import AudioClassificationDataset


class CustomDataset(AudioClassificationDataset):
    meta_file = '/PATH/TO/META_FILE.txt'
    # List all the class labels
    label_list = [
        'cat',
        'dog',
    ]

    def __init__(self, **kwargs):
        files, labels = self._get_data()
        super(CustomDataset, self).__init__(
            files=files, labels=labels, feat_type='raw', **kwargs)

    def _get_data(self):
        '''
        Return the wave file paths and their labels from the meta file.
        '''
        files = []
        labels = []

        with open(self.meta_file) as f:
            for line in f:
                file, label_str = line.strip().split(' ')
                files.append(file)
                labels.append(self.label_list.index(label_str))

        return files, labels
```

Then you can build a dataset and a data loader from `CustomDataset`:
```python
import paddle
from paddleaudio.features import LogMelSpectrogram

from custom_dataset import CustomDataset

# The feature config should be aligned with the pretrained model.
sample_rate = 32000
feat_conf = {
    'sr': sample_rate,
    'n_fft': 1024,
    'hop_length': 320,
    'window': 'hann',
    'win_length': 1024,
    'f_min': 50.0,
    'f_max': 14000.0,
    'n_mels': 64,
}

train_ds = CustomDataset(sample_rate=sample_rate)
feature_extractor = LogMelSpectrogram(**feat_conf)

train_sampler = paddle.io.DistributedBatchSampler(
    train_ds, batch_size=4, shuffle=True, drop_last=False)
train_loader = paddle.io.DataLoader(
    train_ds,
    batch_sampler=train_sampler,
    return_list=True,
    use_buffer_reader=True)
```

Train a model with `CustomDataset`:
```python
from paddlespeech.cls.models import cnn14
from paddlespeech.cls.models import SoundClassifier

backbone = cnn14(pretrained=True, extract_embedding=True)
model = SoundClassifier(backbone, num_class=len(train_ds.label_list))
optimizer = paddle.optimizer.Adam(
    learning_rate=1e-6, parameters=model.parameters())
criterion = paddle.nn.loss.CrossEntropyLoss()

steps_per_epoch = len(train_sampler)
epochs = 10
for epoch in range(1, epochs + 1):
    model.train()

    for batch_idx, batch in enumerate(train_loader):
        waveforms, labels = batch
        # Padding is needed when the lengths of waveforms in a batch differ.
        feats = feature_extractor(waveforms)
        feats = paddle.transpose(feats, [0, 2, 1])
        logits = model(feats)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        if isinstance(optimizer._learning_rate,
                      paddle.optimizer.lr.LRScheduler):
            optimizer._learning_rate.step()
        optimizer.clear_grad()

        # Calculate loss
        avg_loss = loss.numpy()[0]

        # Calculate metrics
        preds = paddle.argmax(logits, axis=1)
        num_corrects = (preds == labels).numpy().sum()
        num_samples = feats.shape[0]

        avg_acc = num_corrects / num_samples

        print_msg = 'Epoch={}/{}, Step={}/{}'.format(
            epoch, epochs, batch_idx + 1, steps_per_epoch)
        print_msg += ' loss={:.4f}'.format(avg_loss)
        print_msg += ' acc={:.4f}'.format(avg_acc)
        print_msg += ' lr={:.6f}'.format(optimizer.get_lr())
        print(print_msg)
```
If you want to save model checkpoints and evaluate on a specific dataset, please see `paddlespeech/cls/exps/panns/train.py` for more details.
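For orientation, here is a minimal sketch of what per-epoch checkpoint saving can look like, continuing from the training loop above. The `./checkpoint/epoch_{N}` layout and file names are illustrative assumptions, not the exact logic of `train.py`:

```python
import os

import paddle

# A sketch only: persist the model and optimizer states once per epoch.
# `model`, `optimizer`, and `epoch` come from the training loop above;
# the directory layout is an assumption for illustration.
ckpt_dir = os.path.join('./checkpoint', 'epoch_{}'.format(epoch))
os.makedirs(ckpt_dir, exist_ok=True)
paddle.save(model.state_dict(), os.path.join(ckpt_dir, 'model.pdparams'))
paddle.save(optimizer.state_dict(), os.path.join(ckpt_dir, 'model.pdopt'))
```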
@ -0,0 +1,51 @@
# Quick Start of Audio Classification

The shell scripts provided in `./examples/esc50/cls0` will help you quickly try out the major modules, including data preparation, model training, and model evaluation, with the [ESC-50](https://github.com/karolpiczak/ESC-50) dataset.

Some of the scripts in `./examples` are not configured to use GPUs by default. If you want to train with 8 GPUs, please set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7`. If you don't have any GPU available, please set `CUDA_VISIBLE_DEVICES=` to use CPUs instead.

Let's start an audio classification task with the following steps:

- Go to the directory

```bash
cd examples/esc50/cls0
```

- Source the environment

```bash
source path.sh
```

- Main entry point

```bash
CUDA_VISIBLE_DEVICES=0 ./run.sh 1
```

This demo includes fine-tuning, evaluating, and deploying an audio classification model. More detailed information is provided in the following sections.

## Fine-tuning a model

PANNs ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are models pretrained on [Audioset](https://research.google.com/audioset/). They can easily be used to extract audio embeddings for audio classification tasks, as sketched below.
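As a rough illustration, the snippet below extracts an embedding with the `cnn14` backbone. It is a sketch under assumptions, not the demo's exact code: the random waveform is a stand-in for real audio, and the feature settings mirror the config used elsewhere in these docs.

```python
import paddle
from paddleaudio.features import LogMelSpectrogram
from paddlespeech.cls.models import cnn14

# Load the pretrained PANNs backbone in embedding-extraction mode.
backbone = cnn14(pretrained=True, extract_embedding=True)
backbone.eval()

# A placeholder batch: 1 second of audio at 32 kHz (random, for illustration).
waveform = paddle.randn([1, 32000])
feats = LogMelSpectrogram(
    sr=32000, n_fft=1024, hop_length=320, n_mels=64)(waveform)
feats = paddle.transpose(feats, [0, 2, 1])  # (batch, time, n_mels)
embedding = backbone(feats)  # one fixed-size embedding per clip
```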
To start fine-tuning a model, please run:
```bash
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
feat_backend=numpy
./local/train.sh ${ngpu} ${feat_backend}
```

## Deploy a model

Once you have saved a model checkpoint, you can export it to a static graph and deploy it with a Python script:

- Export to a static graph

```bash
./local/export.sh ${ckpt_dir} ./export
```
The argument `ckpt_dir` should be a directory in which a model checkpoint is stored, for example `checkpoint/epoch_50`.

The static graph will be exported to `./export`.
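Conceptually, the export step traces the dynamic-graph model into a static graph, roughly as in this sketch (assuming a trained `SoundClassifier` instance named `model` and 64 mel bins; the names and paths are illustrative, not the export script's exact code):

```python
import paddle

# Trace the model with a flexible (batch, time, n_mels) input signature,
# then save the static graph for inference.
input_spec = paddle.static.InputSpec(shape=[None, None, 64], dtype='float32')
static_model = paddle.jit.to_static(model, input_spec=[input_spec])
paddle.jit.save(static_model, './export/inference')
```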
- Inference

```bash
./local/static_model_infer.sh ${infer_device} ./export ${audio_file}
```
The argument `infer_device` can be `cpu` or `gpu`, and it specifies the device used for inference. `audio_file` should be a wave file named `*.wav`.
@ -0,0 +1,42 @@
# TTS Papers
## Text Frontend
### Polyphone
- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136)
- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf)
### Text Normalization
#### English
- [applenob/text_normalization](https://github.com/applenob/text_normalization)
### G2P
#### English
- [cmusphinx/g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq)

## Acoustic Models
- [【AdaSpeech3】AdaSpeech 3: Adaptive Text to Speech for Spontaneous Style](https://arxiv.org/abs/2107.02530)
- [【AdaSpeech2】AdaSpeech 2: Adaptive Text to Speech with Untranscribed Data](https://arxiv.org/abs/2104.09715)
- [【AdaSpeech】AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/abs/2103.00993)
- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
- [【FastPitch】FastPitch: Parallel Text-to-speech with Pitch Prediction](https://arxiv.org/abs/2006.06873)
- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
- [【FastSpeech】FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263)
- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)

## Vocoders
- [【RefineGAN】RefineGAN: Universally Generating Waveform Better than Ground Truth with Highly Accurate Pitch and Intensity Responses](https://arxiv.org/abs/2111.00962)
- [【Fre-GAN】Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297)
- [【StyleMelGAN】StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization](https://arxiv.org/abs/2011.01557)
- [【Multi-band MelGAN】Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106)
- [【HiFi-GAN】HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646)
- [【VocGAN】VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network](https://arxiv.org/abs/2007.15256)
- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
- [【MelGAN】MelGAN: Generative Adversarial Networks for Conditional Waveform Synthesis](https://arxiv.org/abs/1910.06711)
- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- [【LPCNet】LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://arxiv.org/abs/1810.11846)
- [【WaveRNN】Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435)

## GAN TTS
- [【GAN TTS】High Fidelity Speech Synthesis with Adversarial Networks](https://arxiv.org/abs/1909.11646)

## Voice Cloning
- [【SV2TTS】Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/abs/1806.04558)
- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)
@ -1,70 +1,68 @@
# https://yaml.org/type/float.html
data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
  min_input_len: 0.0
  max_input_len: 27.0 # second
  min_output_len: 0.0
  max_output_len: .inf
  min_output_input_ratio: 0.00
  max_output_input_ratio: .inf
###########################################
# Data                                    #
###########################################
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.0
max_input_len: 27.0 # second
min_output_len: 0.0
max_output_len: .inf
min_output_input_ratio: 0.00
max_output_input_ratio: .inf

collator:
  batch_size: 64 # one gpu
  mean_std_filepath: data/mean_std.json
  unit_type: char
  vocab_filepath: data/lang_char/vocab.txt
  augmentation_config: conf/augmentation.json
  random_seed: 0
  spm_model_prefix:
  spectrum_type: linear # linear, mfcc, fbank
  feat_dim:
  delta_delta: False
  stride_ms: 10.0
  window_ms: 20.0
  n_fft: None
  max_freq: None
  target_sample_rate: 16000
  use_dB_normalization: True
  target_dB: -20
  dither: 1.0
  keep_transcription_text: False
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 0
###########################################
# Dataloader                              #
###########################################
batch_size: 64 # one gpu
mean_std_filepath: data/mean_std.json
unit_type: char
vocab_filepath: data/lang_char/vocab.txt
augmentation_config: conf/augmentation.json
random_seed: 0
spm_model_prefix:
spectrum_type: linear # linear, mfcc, fbank
feat_dim: 161
delta_delta: False
stride_ms: 10.0
window_ms: 20.0
n_fft: None
max_freq: None
target_sample_rate: 16000
use_dB_normalization: True
target_dB: -20
dither: 1.0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 0

model:
  num_conv_layers: 2
  num_rnn_layers: 5
  rnn_layer_size: 1024
  rnn_direction: forward # [forward, bidirect]
  num_fc_layers: 0
  fc_layers_size_list: -1,
  use_gru: False
  blank_id: 0
############################################
# Network Architecture                     #
############################################
num_conv_layers: 2
num_rnn_layers: 5
rnn_layer_size: 1024
rnn_direction: forward # [forward, bidirect]
num_fc_layers: 0
fc_layers_size_list: -1,
use_gru: False
blank_id: 0


training:
  n_epoch: 65
  accum_grad: 1
  lr: 5e-4
  lr_decay: 0.93
  weight_decay: 1e-06
  global_grad_clip: 3.0
  log_interval: 100
  checkpoint:
###########################################
# Training                                #
###########################################
n_epoch: 65
accum_grad: 1
lr: 5.0e-4
lr_decay: 0.93
weight_decay: 1.0e-6
global_grad_clip: 3.0
log_interval: 100
checkpoint:
  kbest_n: 50
  latest_n: 5

decoding:
  batch_size: 32
  error_rate_type: cer
  decoding_method: ctc_beam_search
  lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
  alpha: 2.2 #1.9
  beta: 4.3
  beam_size: 300
  cutoff_prob: 0.99
  cutoff_top_n: 40
  num_proc_bsearch: 10
@ -0,0 +1,10 @@
chunk_batch_size: 32
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 2.2 #1.9
beta: 4.3
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
@ -0,0 +1,10 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: ctc_beam_search
lang_model_path: data/lm/zh_giga.no_cna_cmn.prune01244.klm
alpha: 1.9
beta: 5.0
beam_size: 300
cutoff_prob: 0.99
cutoff_top_n: 40
num_proc_bsearch: 10
@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.
@ -0,0 +1,11 @@
beam_size: 10
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
@ -0,0 +1,11 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: True # simulate streaming inference. Defaults to False.
@ -0,0 +1,13 @@
decode_batch_size: 128
error_rate_type: cer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
beam_size: 10
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
                        # <0: for decoding, use full chunk.
                        # >0: for decoding, use fixed chunk size as set.
                        # 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
@ -1,85 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--old-dump-dir",
        default=None,
        type=str,
        help="directory to dump feature files.")
    parser.add_argument(
        "--dump-dir",
        type=str,
        required=True,
        help="directory to dump finetune feature files.")
    args = parser.parse_args()

    old_dump_dir = Path(args.old_dump_dir).expanduser()
    old_dump_dir = old_dump_dir.resolve()
    dump_dir = Path(args.dump_dir).expanduser()
    # use absolute path
    dump_dir = dump_dir.resolve()
    dump_dir.mkdir(parents=True, exist_ok=True)

    assert old_dump_dir.is_dir()
    assert dump_dir.is_dir()

    for sub in ["train", "dev", "test"]:
        # Symlink the *-wave.npy files under old_dump_dir to the
        # corresponding locations in dump_dir.
        output_dir = dump_dir / sub
        output_dir.mkdir(parents=True, exist_ok=True)
        results = []
        for name in os.listdir(output_dir / "raw"):
            # e.g. 003918_feats.npy
            utt_id = name.split("_")[0]
            mel_path = output_dir / ("raw/" + name)
            gen_mel = np.load(mel_path)
            wave_name = utt_id + "_wave.npy"
            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
                       output_dir / ("raw/" + wave_name))
            num_sample = wav.shape[0]
            num_frames = gen_mel.shape[0]
            wav_path = output_dir / ("raw/" + wave_name)

            record = {
                "utt_id": utt_id,
                "num_samples": num_sample,
                "num_frames": num_frames,
                "feats": str(mel_path),
                "wave": str(wav_path),
            }
            results.append(record)

        results.sort(key=itemgetter("utt_id"))

        with jsonlines.open(output_dir / "raw/metadata.jsonl", 'w') as writer:
            for item in results:
                writer.write(item)


if __name__ == "__main__":
    main()
@ -0,0 +1,9 @@
## Metrics

5-fold cross-validation accuracy on the [ESC-50](https://github.com/karolpiczak/ESC-50) dataset:

| Model | Acc    |
| ----- | ------ |
| CNN14 | 0.9500 |
| CNN10 | 0.8975 |
| CNN6  | 0.8825 |
@ -0,0 +1,36 @@
data:
  dataset: 'paddleaudio.datasets:ESC50'
  num_classes: 50
  train:
    mode: 'train'
    split: 1
  dev:
    mode: 'dev'
    split: 1

model:
  backbone: 'paddlespeech.cls.models:cnn14'

feature:
  sr: 32000
  n_fft: 1024
  hop_length: 320
  window: 'hann'
  win_length: 1024
  f_min: 50.0
  f_max: 14000.0
  n_mels: 64

training:
  epochs: 50
  learning_rate: 0.00005
  num_workers: 2
  batch_size: 16
  checkpoint_dir: './checkpoint'
  save_freq: 10
  log_freq: 10

predicting:
  audio_file: '/audio/dog.wav'
  top_k: 10
  checkpoint: './checkpoint/epoch_50/model.pdparams'
@ -1,8 +1,8 @@
#!/bin/bash

ckpt_dir=$1
ckpt=$1
output_dir=$2

python3 ${BIN_DIR}/export_model.py \
    --checkpoint ${ckpt_dir}/model.pdparams \
    --checkpoint ${ckpt} \
    --output_dir ${output_dir}
@ -1,11 +1,4 @@
#!/bin/bash

audio_file=$1
ckpt_dir=$2
feat_backend=$3

python3 ${BIN_DIR}/predict.py \
    --wav ${audio_file} \
    --feat_backend ${feat_backend} \
    --top_k 10 \
    --checkpoint ${ckpt_dir}/model.pdparams
    --cfg_path=$1
@ -1,25 +1,12 @@
#!/bin/bash

ngpu=$1
feat_backend=$2

num_epochs=50
batch_size=16
ckpt_dir=./checkpoint
save_freq=10
cfg_path=$2

if [ ${ngpu} -gt 0 ]; then
    python3 -m paddle.distributed.launch --gpus $CUDA_VISIBLE_DEVICES ${BIN_DIR}/train.py \
        --epochs ${num_epochs} \
        --feat_backend ${feat_backend} \
        --batch_size ${batch_size} \
        --checkpoint_dir ${ckpt_dir} \
        --save_freq ${save_freq}
        --cfg_path ${cfg_path}
else
    python3 ${BIN_DIR}/train.py \
        --epochs ${num_epochs} \
        --feat_backend ${feat_backend} \
        --batch_size ${batch_size} \
        --checkpoint_dir ${ckpt_dir} \
        --save_freq ${save_freq}
        --cfg_path ${cfg_path}
fi
@ -0,0 +1,44 @@
###########################################################
#                      DATA SETTING                       #
###########################################################
dataset_type: Ernie
train_path: data/iwslt2012_zh/train.txt
dev_path: data/iwslt2012_zh/dev.txt
test_path: data/iwslt2012_zh/test.txt
batch_size: 64
num_workers: 2
data_params:
  pretrained_token: ernie-1.0
  punc_path: data/iwslt2012_zh/punc_vocab
  seq_len: 100


###########################################################
#                      MODEL SETTING                      #
###########################################################
model_type: ErnieLinear
model:
  pretrained_token: ernie-1.0
  num_classes: 4

###########################################################
#                    OPTIMIZER SETTING                    #
###########################################################
optimizer_params:
  weight_decay: 1.0e-6 # weight decay coefficient.

scheduler_params:
  learning_rate: 1.0e-5 # learning rate.
  gamma: 1.0 # scheduler gamma.

###########################################################
#                    TRAINING SETTING                     #
###########################################################
max_epoch: 20
num_snapshots: 5

###########################################################
#                      OTHER SETTING                      #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
@ -1,36 +0,0 @@
data:
  dataset_type: Ernie
  train_path: data/iwslt2012_zh/train.txt
  dev_path: data/iwslt2012_zh/dev.txt
  test_path: data/iwslt2012_zh/test.txt
  data_params:
    pretrained_token: ernie-1.0
    punc_path: data/iwslt2012_zh/punc_vocab
    seq_len: 100
  batch_size: 64
  sortagrad: True
  shuffle_method: batch_shuffle
  num_workers: 0

checkpoint:
  kbest_n: 5
  latest_n: 10
  metric_type: F1

model_type: ErnieLinear

model_params:
  pretrained_token: ernie-1.0
  num_classes: 4

training:
  n_epoch: 100
  lr: !!float 1e-5
  lr_decay: 1.0
  weight_decay: !!float 1e-06
  global_grad_clip: 5.0
  log_interval: 10
  log_path: log/train_ernie_linear.log

testing:
  log_path: log/test_ernie_linear.log
@ -1,23 +0,0 @@
#! /usr/bin/env bash

if [ $# != 2 ]; then
    echo "usage: ${0} ckpt_dir avg_num"
    exit -1
fi

ckpt_dir=${1}
average_num=${2}
decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

python3 -u ${BIN_DIR}/avg_model.py \
    --dst_model ${decode_checkpoint} \
    --ckpt_dir ${ckpt_dir} \
    --num ${average_num} \
    --val_best

if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"
    exit 1
fi

exit 0
@ -0,0 +1,12 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3
text=$4
ckpt_prefix=${ckpt_name%.*}

python3 ${BIN_DIR}/punc_restore.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
    --text=${text}
@ -1,26 +1,11 @@
#!/bin/bash

if [ $# != 2 ];then
    echo "usage: ${0} config_path ckpt_path_prefix"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_prefix=$2

python3 -u ${BIN_DIR}/test.py \
    --ngpu 1 \
    --config ${config_path} \
    --result_file ${ckpt_prefix}.rsl \
    --checkpoint_path ${ckpt_prefix}
train_output_path=$2
ckpt_name=$3

if [ $? -ne 0 ]; then
    echo "Failed in evaluation!"
    exit 1
fi
ckpt_prefix=${ckpt_name%.*}

exit 0
python3 ${BIN_DIR}/test.py \
    --config=${config_path} \
    --checkpoint=${train_output_path}/checkpoints/${ckpt_name}
@ -1,28 +1,9 @@
#!/bin/bash

if [ $# != 3 ];then
    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name log_dir"
    exit -1
fi

ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."

config_path=$1
ckpt_name=$2
log_dir=$3

mkdir -p exp

python3 -u ${BIN_DIR}/train.py \
    --ngpu ${ngpu} \
    --config ${config_path} \
    --output_dir exp/${ckpt_name} \
    --log_dir ${log_dir}

if [ $? -ne 0 ]; then
    echo "Failed in training!"
    exit 1
fi
train_output_path=$2

exit 0
python3 ${BIN_DIR}/train.py \
    --config=${config_path} \
    --output-dir=${train_output_path} \
    --ngpu=1
@ -1,40 +1,35 @@
#!/bin/bash
set -e
source path.sh

if [ $# -ne 4 ]; then
    echo "usage: bash ./run.sh stage gpu train_config avg_num"
    echo "eg: bash ./run.sh 1 0 train_config 1"
    exit -1
fi

stage=$1
gpus=0,1
stage=0
stop_stage=100
gpus=$2
conf_path=$3
avg_num=$4
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
log_dir=log

source path.sh ${ckpt}
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_12840.pdz
text=今天的天气真不错啊你下午有空吗我想约你一起去吃饭

# With the following command, you can choose the stage range you want to run,
# such as `./run.sh --stage 0 --stop-stage 0`.
# This cannot be mixed with the positional arguments `$1`, `$2`, ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh
    ./local/data.sh
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    # train model, all `ckpt` under `exp` dir
    CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${conf_path} ${ckpt} ${log_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # avg n best model
    bash ./local/avg.sh exp/${ckpt}/checkpoints ${avg_num}
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # test ckpt avg_n
    CUDA_VISIBLE_DEVICES=${gpus} bash ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/punc_restore.sh ${conf_path} ${train_output_path} ${ckpt_name} ${text} || exit -1
fi
@ -0,0 +1,10 @@
decode_batch_size: 128
error_rate_type: wer
decoding_method: ctc_beam_search
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 1.9
beta: 0.3
beam_size: 500
cutoff_prob: 1.0
cutoff_top_n: 40
num_proc_bsearch: 8