From d55e6b5a0a58212c13169ba1d3297f9431b62b6b Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 03:16:49 +0000 Subject: [PATCH 1/6] revise from_pretrained function --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 10 +++++++++- deepspeech/exps/deepspeech2/bin/deploy/server.py | 10 +++++++++- deepspeech/exps/deepspeech2/bin/tune.py | 2 +- deepspeech/exps/deepspeech2/model.py | 2 +- deepspeech/exps/u2/model.py | 2 +- deepspeech/models/deepspeech2.py | 8 ++++---- deepspeech/models/u2.py | 8 ++++---- 7 files changed, 29 insertions(+), 13 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index f3125e04..0ec36b5d 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -29,6 +29,9 @@ from deepspeech.utils.socket_server import warm_up_test from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments +from paddle.io import DataLoader +from deepspeech.io.collator import SpeechCollator + def init_predictor(args): if args.model_dir is not None: @@ -83,7 +86,12 @@ def start_server(config, args): config.data.keep_transcription_text = True dataset = ManifestDataset.from_config(config) - model = DeepSpeech2Model.from_pretrained(dataset, config, + config.collator.batch_size=1 + config.collator.num_workers=0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index b2ff37e0..40ba4c72 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -28,6 +28,9 @@ from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments +from paddle.io import DataLoader +from deepspeech.io.collator import SpeechCollator + def start_server(config, args): """Start the ASR server""" config.defrost() @@ -36,7 +39,12 @@ def start_server(config, args): config.data.keep_transcription_text = True dataset = ManifestDataset.from_config(config) - model = DeepSpeech2Model.from_pretrained(dataset, config, + config.collator.batch_size=1 + config.collator.num_workers=0 + collate_fn = SpeechCollator.from_config(config) + test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + + model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/bin/tune.py b/deepspeech/exps/deepspeech2/bin/tune.py index 02e329a1..f10dc27c 100644 --- a/deepspeech/exps/deepspeech2/bin/tune.py +++ b/deepspeech/exps/deepspeech2/bin/tune.py @@ -47,7 +47,7 @@ def tune(config, args): drop_last=False, collate_fn=SpeechCollator(keep_transcription_text=True)) - model = DeepSpeech2Model.from_pretrained(dev_dataset, config, + model = DeepSpeech2Model.from_pretrained(valid_loader, config, args.checkpoint_path) model.eval() diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index deb8752b..209e8b02 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -318,7 +318,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): def export(self): infer_model = DeepSpeech2InferModel.from_pretrained( - self.test_loader.dataset, self.config, self.args.checkpoint_path) + self.test_loader, self.config, self.args.checkpoint_path) infer_model.eval() feat_dim = self.test_loader.collate_fn.feature_size static_model = paddle.jit.to_static( diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 05551875..308569cd 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -506,7 +506,7 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. """ from deepspeech.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.test_loader.dataset, + infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.model.clone(), self.args.checkpoint_path) feat_dim = self.test_loader.collate_fn.feature_size diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py index 0ff5514d..d2c03a18 100644 --- a/deepspeech/models/deepspeech2.py +++ b/deepspeech/models/deepspeech2.py @@ -198,11 +198,11 @@ class DeepSpeech2Model(nn.Layer): cutoff_top_n, num_processes) @classmethod - def from_pretrained(cls, dataset, config, checkpoint_path): + def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. Parameters ---------- - dataset: paddle.io.Dataset + dataloader: paddle.io.DataLoader config: yacs.config.CfgNode model configs @@ -215,8 +215,8 @@ class DeepSpeech2Model(nn.Layer): DeepSpeech2Model The model built from pretrained result. """ - model = cls(feat_size=dataset.feature_size, - dict_size=dataset.vocab_size, + model = cls(feat_size=dataloader.collate_fn.feature_size, + dict_size=dataloader.collate_fn.vocab_size, num_conv_layers=config.model.num_conv_layers, num_rnn_layers=config.model.num_rnn_layers, rnn_size=config.model.rnn_layer_size, diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index 238e2d35..23ae3423 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -876,11 +876,11 @@ class U2Model(U2BaseModel): return model @classmethod - def from_pretrained(cls, dataset, config, checkpoint_path): + def from_pretrained(cls, dataloader, config, checkpoint_path): """Build a DeepSpeech2Model model from a pretrained model. Args: - dataset (paddle.io.Dataset): not used. + dataloader (paddle.io.DataLoader): not used. config (yacs.config.CfgNode): model configs checkpoint_path (Path or str): the path of pretrained model checkpoint, without extension name @@ -888,8 +888,8 @@ class U2Model(U2BaseModel): DeepSpeech2Model: The model built from pretrained result. """ config.defrost() - config.input_dim = dataset.feature_size - config.output_dim = dataset.vocab_size + config.input_dim = dataloader.collate_fn.feature_size + config.output_dim = dataloader.collate_fn.vocab_size config.freeze() model = cls.from_config(config) From c753b9ddf2b321caf873187bd7a498fb61d4bf0a Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 09:05:34 +0000 Subject: [PATCH 2/6] fix runtime.py and server.py --- .../exps/deepspeech2/bin/deploy/runtime.py | 20 ++++++++--------- .../exps/deepspeech2/bin/deploy/server.py | 22 +++++++++++-------- deepspeech/io/collator.py | 3 ++- deepspeech/utils/socket_server.py | 4 ++-- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index 0ec36b5d..26365820 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -81,15 +81,15 @@ def inference(config, args): def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manfiest = config.data.test_manifest - config.data.augmentation_config = "" - config.data.keep_transcription_text = True + config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) - + + config.collator.augmentation_config = "" + config.collator.keep_transcription_text = True config.collator.batch_size=1 config.collator.num_workers=0 collate_fn = SpeechCollator.from_config(config) - test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) @@ -97,15 +97,15 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = dataset.process_utterance(filename, "") - audio = np.array([feature[0]]).astype('float32') #[1, D, T] - audio_len = feature[0].shape[1] + feature = collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + audio_len = feature[0].shape[0] audio_len = np.array([audio_len]).astype('int64') # [1] result_transcript = model.decode( paddle.to_tensor(audio), paddle.to_tensor(audio_len), - vocab_list=dataset.vocab_list, + vocab_list=test_loader.collate_fn.vocab_list, decoding_method=config.decoding.decoding_method, lang_model_path=config.decoding.lang_model_path, beam_alpha=config.decoding.alpha, @@ -146,7 +146,7 @@ if __name__ == "__main__": add_arg('host_ip', str, 'localhost', "Server's IP address.") - add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('host_port', int, 8089, "Server's IP port.") add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index 40ba4c72..73a3fc17 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -34,15 +34,15 @@ from deepspeech.io.collator import SpeechCollator def start_server(config, args): """Start the ASR server""" config.defrost() - config.data.manfiest = config.data.test_manifest - config.data.augmentation_config = "" - config.data.keep_transcription_text = True + config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) + config.collator.augmentation_config = "" + config.collator.keep_transcription_text = True config.collator.batch_size=1 config.collator.num_workers=0 collate_fn = SpeechCollator.from_config(config) - test_loader = DataLoader(dataset_dataset, collate_fn=collate_fn, num_workers=0) + test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) model = DeepSpeech2Model.from_pretrained(test_loader, config, args.checkpoint_path) @@ -50,15 +50,19 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = dataset.process_utterance(filename, "") - audio = np.array([feature[0]]).astype('float32') #[1, D, T] - audio_len = feature[0].shape[1] + feature = test_loader.collate_fn.process_utterance(filename, "") + audio = np.array([feature[0]]).astype('float32') #[1, T, D] + # audio = audio.swapaxes(1,2) + print('---file_to_transcript feature----') + print(audio.shape) + audio_len = feature[0].shape[0] + print(audio_len) audio_len = np.array([audio_len]).astype('int64') # [1] result_transcript = model.decode( paddle.to_tensor(audio), paddle.to_tensor(audio_len), - vocab_list=dataset.vocab_list, + vocab_list=test_loader.collate_fn.vocab_list, decoding_method=config.decoding.decoding_method, lang_model_path=config.decoding.lang_model_path, beam_alpha=config.decoding.alpha, @@ -99,7 +103,7 @@ if __name__ == "__main__": add_arg('host_ip', str, 'localhost', "Server's IP address.") - add_arg('host_port', int, 8086, "Server's IP port.") + add_arg('host_port', int, 8088, "Server's IP port.") add_arg('speech_save_dir', str, 'demo_cache', "Directory to save demo audios.") diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 1061f97c..94264d6f 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -242,6 +242,7 @@ class SpeechCollator(): # specgram augment specgram = self._augmentation_pipeline.transform_feature(specgram) + specgram=specgram.transpose([1,0]) return specgram, transcript_part def __call__(self, batch): @@ -269,7 +270,7 @@ class SpeechCollator(): #utt utts.append(utt) # audio - audios.append(audio.T) # [T, D] + audios.append(audio) # [T, D] audio_lens.append(audio.shape[1]) # text # for training, text is token ids diff --git a/deepspeech/utils/socket_server.py b/deepspeech/utils/socket_server.py index adcbf3bb..8fd7c2fa 100644 --- a/deepspeech/utils/socket_server.py +++ b/deepspeech/utils/socket_server.py @@ -48,9 +48,9 @@ def warm_up_test(audio_process_handler, rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): - print("Warm-up Test Case %d: %s", idx, sample['audio_filepath']) + print("Warm-up Test Case %d: %s"%(idx, sample['feat'])) start_time = time.time() - transcript = audio_process_handler(sample['audio_filepath']) + transcript = audio_process_handler(sample['feat']) finish_time = time.time() print("Response Time: %f, Transcript: %s" % (finish_time - start_time, transcript)) From 340e622953ba941c3e7ae75bbb59e45018a0d1a5 Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Wed, 23 Jun 2021 09:14:56 +0000 Subject: [PATCH 3/6] fix runtime and server --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 11 +++++------ deepspeech/exps/deepspeech2/bin/deploy/server.py | 9 ++++----- deepspeech/io/collator.py | 2 +- deepspeech/utils/socket_server.py | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index 26365820..dad8459e 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -18,8 +18,10 @@ import numpy as np import paddle from paddle.inference import Config from paddle.inference import create_predictor +from paddle.io import DataLoader from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.training.cli import default_argument_parser @@ -29,9 +31,6 @@ from deepspeech.utils.socket_server import warm_up_test from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments -from paddle.io import DataLoader -from deepspeech.io.collator import SpeechCollator - def init_predictor(args): if args.model_dir is not None: @@ -83,11 +82,11 @@ def start_server(config, args): config.defrost() config.data.manifest = config.data.test_manifest dataset = ManifestDataset.from_config(config) - + config.collator.augmentation_config = "" config.collator.keep_transcription_text = True - config.collator.batch_size=1 - config.collator.num_workers=0 + config.collator.batch_size = 1 + config.collator.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/server.py b/deepspeech/exps/deepspeech2/bin/deploy/server.py index 73a3fc17..b473a8fd 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/server.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py @@ -16,8 +16,10 @@ import functools import numpy as np import paddle +from paddle.io import DataLoader from deepspeech.exps.deepspeech2.config import get_cfg_defaults +from deepspeech.io.collator import SpeechCollator from deepspeech.io.dataset import ManifestDataset from deepspeech.models.deepspeech2 import DeepSpeech2Model from deepspeech.training.cli import default_argument_parser @@ -28,9 +30,6 @@ from deepspeech.utils.utility import add_arguments from deepspeech.utils.utility import print_arguments -from paddle.io import DataLoader -from deepspeech.io.collator import SpeechCollator - def start_server(config, args): """Start the ASR server""" config.defrost() @@ -39,8 +38,8 @@ def start_server(config, args): config.collator.augmentation_config = "" config.collator.keep_transcription_text = True - config.collator.batch_size=1 - config.collator.num_workers=0 + config.collator.batch_size = 1 + config.collator.num_workers = 0 collate_fn = SpeechCollator.from_config(config) test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0) diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 94264d6f..305ca940 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -242,7 +242,7 @@ class SpeechCollator(): # specgram augment specgram = self._augmentation_pipeline.transform_feature(specgram) - specgram=specgram.transpose([1,0]) + specgram = specgram.transpose([1, 0]) return specgram, transcript_part def __call__(self, batch): diff --git a/deepspeech/utils/socket_server.py b/deepspeech/utils/socket_server.py index 8fd7c2fa..45c659f6 100644 --- a/deepspeech/utils/socket_server.py +++ b/deepspeech/utils/socket_server.py @@ -48,7 +48,7 @@ def warm_up_test(audio_process_handler, rng = random.Random(random_seed) samples = rng.sample(manifest, num_test_cases) for idx, sample in enumerate(samples): - print("Warm-up Test Case %d: %s"%(idx, sample['feat'])) + print("Warm-up Test Case %d: %s" % (idx, sample['feat'])) start_time = time.time() transcript = audio_process_handler(sample['feat']) finish_time = time.time() From 4b80b172d3b163b196868965510709a7b96c93ad Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 24 Jun 2021 02:32:34 +0000 Subject: [PATCH 4/6] add model params --- examples/aishell/s0/README.md | 14 +++++++------- examples/aishell/s1/README.md | 24 ++++++++++++------------ 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/examples/aishell/s0/README.md b/examples/aishell/s0/README.md index ae3fb401..c2588845 100644 --- a/examples/aishell/s0/README.md +++ b/examples/aishell/s0/README.md @@ -2,10 +2,10 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | CER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | -| DeepSpeech2 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | -| DeepSpeech2 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | -| DeepSpeech2 58.4M | 1.8.5 | - | test | - | 0.080447 | +| Model | Params | Release | Config | Test set | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 | +| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 | +| DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 | +| DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 | +| DeepSpeech2 | 58.4M | 1.8.5 | - | test | - | 0.080447 | diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md index 601b0a8d..72a03b61 100644 --- a/examples/aishell/s1/README.md +++ b/examples/aishell/s1/README.md @@ -2,21 +2,21 @@ ## Conformer -| Model | Config | Augmentation| Test set | Decode method | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | -| conformer | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | +| conformer | 47.06M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | ## Chunk Conformer -| Model | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | -| conformer | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | +| Model | Params | Config | Augmentation| Test set | Decode method | Chunk | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16 | - | 0.061939 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16 | - | 0.070806 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16 | - | 0.070739 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16 | - | 0.059400 | ## Transformer From 019ae4b35c2f713e17e69f2f8a8bd6b199642b0e Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 25 Jun 2021 03:13:40 +0000 Subject: [PATCH 5/6] fix conf for ds2 --- examples/aishell/s0/conf/deepspeech2.yaml | 3 +-- examples/aishell/s0/run.sh | 4 ++-- examples/librispeech/s0/README.md | 10 +++++----- examples/librispeech/s0/conf/deepspeech2.yaml | 13 +++++++++---- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 8cc4c4c9..1004fde0 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -10,8 +10,8 @@ data: min_output_input_ratio: 0.00 max_output_input_ratio: .inf - collator: + batch_size: 64 # one gpu mean_std_filepath: data/mean_std.json unit_type: char vocab_filepath: data/vocab.txt @@ -33,7 +33,6 @@ collator: sortagrad: True shuffle_method: batch_shuffle num_workers: 0 - batch_size: 64 # one gpu model: num_conv_layers: 2 diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index 05829136..c9708dcc 100755 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -31,10 +31,10 @@ fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 + CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # export ckpt avg_n - CUDA_VISIBLE_DEVICES=${gpus} ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit + CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit fi diff --git a/examples/librispeech/s0/README.md b/examples/librispeech/s0/README.md index 393dd457..dde288bd 100644 --- a/examples/librispeech/s0/README.md +++ b/examples/librispeech/s0/README.md @@ -2,8 +2,8 @@ ## Deepspeech2 -| Model | release | Config | Test set | Loss | WER | -| --- | --- | --- | --- | --- | --- | -| DeepSpeech2 | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | -| DeepSpeech2 | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | -| DeepSpeech2 | 1.8.5 | - | test-clean | - | 0.074939 | +| Model | Params | Release | Config | Test set | Loss | WER | +| --- | --- | --- | --- | --- | --- | --- | +| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | 15.184467315673828 | test-clean | 0.072154 | +| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | - | test-clean | 0.073973 | +| DeepSpeech2 | 42.96M | 1.8.5 | - | test-clean | - | 0.074939 | diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index d1746bff..b419cbe2 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -3,16 +3,21 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev-clean test_manifest: data/manifest.test-clean - mean_std_filepath: data/mean_std.json - vocab_filepath: data/vocab.txt - augmentation_config: conf/augmentation.json - batch_size: 20 min_input_len: 0.0 max_input_len: 27.0 # second min_output_len: 0.0 max_output_len: .inf min_output_input_ratio: 0.00 max_output_input_ratio: .inf + +collator: + batch_size: 20 + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt + augmentation_config: conf/augmentation.json + random_seed: 0 + spm_model_prefix: specgram_type: linear target_sample_rate: 16000 max_freq: None From 3965dbc2c33661fda86c1f29b5a5afbeddeb653c Mon Sep 17 00:00:00 2001 From: Haoxin Ma <745165806@qq.com> Date: Fri, 25 Jun 2021 05:23:44 +0000 Subject: [PATCH 6/6] runtime.py --- deepspeech/exps/deepspeech2/bin/deploy/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py index dad8459e..01f01b65 100644 --- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py +++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py @@ -96,7 +96,7 @@ def start_server(config, args): # prepare ASR inference handler def file_to_transcript(filename): - feature = collate_fn.process_utterance(filename, "") + feature = test_loader.collate_fn.process_utterance(filename, "") audio = np.array([feature[0]]).astype('float32') #[1, T, D] audio_len = feature[0].shape[0] audio_len = np.array([audio_len]).astype('int64') # [1]