pull/1019/head
Hui Zhang 3 years ago
parent 7ec0ed4aaf
commit 56480e1033

@ -28,4 +28,4 @@ Need set `decoding.decoding_chunk_size=16` when decoding.
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |

@ -82,7 +82,7 @@ def create_manifest(data_dir, manifest_path_prefix):
# if no transcription for audio then skipped
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)

@ -73,7 +73,6 @@ def create_manifest(data_dir, manifest_path_prefix):
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
translation_str = " ".join(translation.split())
trancription_str = " ".join(trancription.split())
json_lines.append(
@ -82,7 +81,7 @@ def create_manifest(data_dir, manifest_path_prefix):
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': [translation_str, trancription_str],
'text': [translation_str, trancription_str],
},
ensure_ascii=False))

@ -124,7 +124,7 @@ def create_manifest(data_dir, manifest_path_prefix):
json.dumps(
{
'utt': audio_id,
'utt2spk', spk,
'utt2spk': spk,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': word_text, # charactor

@ -22,9 +22,9 @@ import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from pathlib import Path
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(

@ -24,4 +24,4 @@
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention | 6.805267604192098, | 0.049795 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_greedy_search | 6.805267604192098, | 0.054892 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | ctc_prefix_beam_search | 6.805267604192098, | 0.054531 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug | test-clean | attention_rescoring | 6.805267604192098, | 0.042244 |

@ -4,4 +4,4 @@ asr model with phone unit
* asr0 - deepspeech2 Streaming/Non-Streaming
* asr1 - transformer/conformer Streaming/Non-Streaming
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature
* asr2 - transformer/conformer Streaming/Non-Streaming with Kaldi feature

@ -55,4 +55,4 @@ As shown in the following table, we provide 3 training subsets, namely `S`, `M`
|-----------------|-------|--------------|-----------------------------------------------------------------------------------------|
| DEV | 20 | Internet | Specially designed for some speech tools which require cross-validation set in training |
| TEST\_NET | 23 | Internet | Match test |
| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset |
| TEST\_MEETING | 15 | Real meeting | Mismatch test which is a far-field, conversational, spontaneous, and meeting dataset |

@ -21,4 +21,4 @@ Pretrain model from http://mobvoi-speech-public.ufile.ucloud.cn/public/wenet/wen
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention | - | 0.048456 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_greedy_search | - | 0.052534 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | ctc_prefix_beam_search | - | 0.052915 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |
| conformer | 32.52 M | conf/conformer.yaml | spec_aug | aishell1 | attention_rescoring | - | 0.047904 |

@ -1,6 +1,18 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang)
# Mobvoi Inc(Author: Di Wu, Binbin Zhang)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -12,11 +24,10 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
import os
import sys
def get_args():
@ -85,13 +96,13 @@ def meta_analysis(input_json, output_dir):
else:
utt2text.write(f'{sid}\t{text}\n')
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
)
f'{sid}\t{aid}\t{start_time}\t{end_time}\n')
utt2dur.write(f'{sid}\t{dur}\n')
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
def main():
args = get_args()
@ -99,4 +110,4 @@ def main():
if __name__ == '__main__':
main()
main()

@ -1,5 +1,17 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
@ -11,14 +23,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# process_opus.py: segmentation and downsampling of opus audio
# usage: python3 process_opus.py wav.scp segments output_wav.scp
import os
import sys
from pydub import AudioSegment
import sys
import os
def read_file(wav_scp, segments):
@ -86,4 +96,4 @@ def main():
if __name__ == '__main__':
main()
main()

@ -24,15 +24,10 @@ import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
from paddle.io import DataLoader
from yacs.config import CfgNode
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.collator import SpeechCollator
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.io.dataset import ManifestDataset
from paddlespeech.s2t.io.sampler import SortagradBatchSampler
from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler
from paddlespeech.s2t.models.u2 import U2Model
from paddlespeech.s2t.training.optimizer import OptimizerFactory
from paddlespeech.s2t.training.reporter import ObsScope
@ -215,7 +210,7 @@ class U2Trainer(Trainer):
msg += f"{v:>.8f}" if isinstance(v,
float) else f"{v}"
msg += f" {k.split(',')[1]}" if len(
k.split(',')) == 2 else f""
k.split(',')) == 2 else ""
msg += ","
msg = msg[:-1] # remove the last ","
if (batch_index + 1

@ -57,7 +57,7 @@ class TextFeaturizer():
vocab_filepath, maskctc)
self.vocab_size = len(self.vocab_list)
else:
logger.warning(f"TextFeaturizer: not have vocab file.")
logger.warning("TextFeaturizer: not have vocab file.")
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'

@ -341,7 +341,7 @@ class LogMelSpectrogramKaldi():
self.eps = eps
self.remove_dc_offset = True
self.preemph = 0.97
self.dither = dither # only work in train mode
self.dither = dither # only work in train mode
def __repr__(self):
return (

Loading…
Cancel
Save