Merge pull request #747 from LittleChenCc/develop

refine the code and correct yaml
3 years ago · 5e8e46ed79
parent 566f636cc6 45e71a0a64
commit 5e8e46ed79
4 changed files with 14 additions and 10 deletions
--- a/deepspeech/io/collator_st.py
+++ b/deepspeech/io/collator_st.py
@@ -563,7 +563,7 @@ class KaldiPrePorocessedCollator(SpeechCollator):
    @property
    def feature_size(self):
        return self._feat_dim
-    
+
    @property
    def stride_ms(self):
        return self._stride_ms
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
@@ -35,6 +35,7 @@ def bleu(hypothesis, reference):

    return sacrebleu.corpus_bleu(hypothesis, reference)

+
 def char_bleu(hypothesis, reference):
    """Calculate BLEU. BLEU compares reference text and
    hypothesis text in char-level using scarebleu.
@@ -47,7 +48,8 @@ def char_bleu(hypothesis, reference):
    :type hypothesis: list[str]
    :raises ValueError: If the reference number is zero.
    """
-    hypothesis =[' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
-    reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref ]for ref in reference ]
+    hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
+    reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
+                 for ref in reference]

-    return sacrebleu.corpus_bleu(hypothesis, reference)
+    return sacrebleu.corpus_bleu(hypothesis, reference)
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -44,9 +44,11 @@ def create_manifest(data_dir, manifest_path_prefix):
    print("Creating manifest %s ..." % manifest_path_prefix)
    json_lines = []

-    data_types_infos = [('train', 'train-split/train-segment', 'En-Zh/train.en-zh'), 
-                ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'), 
-                ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')]
+    data_types_infos = [
+        ('train', 'train-split/train-segment', 'En-Zh/train.en-zh'),
+        ('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'),
+        ('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')
+    ]
    for data_info in data_types_infos:
        dtype, audio_relative_dir, text_relative_path = data_info
        del json_lines[:]
@@ -63,7 +65,7 @@ def create_manifest(data_dir, manifest_path_prefix):
                continue
            audio_id, trancription, translation = line.split('\t')
            utt = audio_id.split('.')[0]
-            
+
            audio_path = os.path.join(audio_dir, audio_id)
            if os.path.exists(audio_path):
                if os.path.getsize(audio_path) < 30000:
--- a/examples/ted_en_zh/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/conf/transformer_joint_noam.yaml
@@ -3,8 +3,8 @@ data:
  train_manifest: data/manifest.train
  dev_manifest: data/manifest.dev
  test_manifest: data/manifest.test
-  min_input_len: 0.5  # second
-  max_input_len: 3000.0 # second
+  min_input_len: 0.05  # second
+  max_input_len: 30.0 # second
  min_output_len: 0.0 # tokens
  max_output_len: 400.0 # tokens
  min_output_input_ratio: 0.01