fix dev & test dataset filter

pull/578/head
Hui Zhang 4 years ago
parent b69021f9e6
commit 3a2c722d22

@ -144,6 +144,12 @@ class DeepSpeech2Trainer(Trainer):
config.data.manifest = config.data.dev_manifest config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = "" config.data.augmentation_config = ""
config.data.min_input_len = 0.0 # second
config.data.max_input_len = 100.0 # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = 400.0 # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = 100.0
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
if self.parallel: if self.parallel:
@ -320,9 +326,15 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
config.defrost() config.defrost()
# return raw text # return raw text
config.data.manifest = config.data.test_manifest
config.data.keep_transcription_text = True config.data.keep_transcription_text = True
config.data.augmentation_config = "" config.data.augmentation_config = ""
config.data.manifest = config.data.test_manifest config.data.min_input_len = 0.0 # second
config.data.max_input_len = 100.0 # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = 400.0 # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = 100.0
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
# return text ord id # return text ord id

@ -215,8 +215,14 @@ class U2Trainer(Trainer):
config.data.manifest = config.data.train_manifest config.data.manifest = config.data.train_manifest
train_dataset = ManifestDataset.from_config(config) train_dataset = ManifestDataset.from_config(config)
config.data.manifest = config.data.dev_manifest
config.data.augmentation_config = "" config.data.augmentation_config = ""
config.data.min_input_len = 0.0 # second
config.data.max_input_len = 100.0 # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = 400.0 # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = 100.0
config.data.manifest = config.data.dev_manifest
dev_dataset = ManifestDataset.from_config(config) dev_dataset = ManifestDataset.from_config(config)
collate_fn = SpeechCollator(keep_transcription_text=False) collate_fn = SpeechCollator(keep_transcription_text=False)
@ -253,6 +259,12 @@ class U2Trainer(Trainer):
# test dataset, return raw text # test dataset, return raw text
config.data.keep_transcription_text = True config.data.keep_transcription_text = True
config.data.augmentation_config = "" config.data.augmentation_config = ""
config.data.min_input_len = 0.0 # second
config.data.max_input_len = 100.0 # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = 400.0 # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = 100.0
config.data.manifest = config.data.test_manifest config.data.manifest = config.data.test_manifest
test_dataset = ManifestDataset.from_config(config) test_dataset = ManifestDataset.from_config(config)
# return text ord id # return text ord id

@ -27,6 +27,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
mv data/manifest.${set} data/manifest.${set}.raw mv data/manifest.${set} data/manifest.${set}.raw
done done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw cat data/manifest.${set}.raw >> data/manifest.train.raw
done done

@ -31,6 +31,7 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
mv data/manifest.${set} data/manifest.${set}.raw mv data/manifest.${set} data/manifest.${set}.raw
done done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw cat data/manifest.${set}.raw >> data/manifest.train.raw
done done

Loading…
Cancel
Save