Unify encoding to 'utf-8' and optimize error rate calculation.

9 years ago · 14d2fb795c
parent 7e39debcb0
commit 14d2fb795c
6 changed files with 71 additions and 45 deletions
--- a/data_utils/data.py
+++ b/data_utils/data.py
@@ -91,7 +91,7 @@ class DataGenerator(object):
        :param transcript: Transcription text.
        :type transcript: basestring
        :return: Tuple of audio feature tensor and list of token ids for
-                 transcription. 
+                 transcription.
        :rtype: tuple of (2darray, list)
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
@@ -111,7 +111,7 @@ class DataGenerator(object):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.
-        
+
        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

@@ -191,9 +191,9 @@ class DataGenerator(object):
    @property
    def feeding(self):
        """Returns data reader's feeding dict.
-        
+
        :return: Data feeding dict.
-        :rtype: dict 
+        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}

--- a/data_utils/featurizer/text_featurizer.py
+++ b/data_utils/featurizer/text_featurizer.py
@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function

 import os
+import codecs


 class TextFeaturizer(object):
@@ -59,7 +60,7 @@ class TextFeaturizer(object):
    def _load_vocabulary_from_file(self, vocab_filepath):
        """Load vocabulary from file."""
        vocab_lines = []
-        with open(vocab_filepath, 'r') as file:
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
--- a/data_utils/utils.py
+++ b/data_utils/utils.py
@@ -4,15 +4,16 @@ from __future__ import division
 from __future__ import print_function

 import json
+import codecs


 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Load and parse manifest file.
-    
+
    Instances with durations outside [min_duration, max_duration] will be
    filtered out.

-    :param manifest_path: Manifest file to load and parse. 
+    :param manifest_path: Manifest file to load and parse.
    :type manifest_path: basestring
    :param max_duration: Maximal duration in seconds for instance filter.
    :type max_duration: float
@@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
    :raises IOError: If failed to parse the manifest.
    """
    manifest = []
-    for json_line in open(manifest_path):
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
--- a/datasets/librispeech/librispeech.py
+++ b/datasets/librispeech/librispeech.py
@@ -17,6 +17,7 @@ import argparse
 import soundfile
 import json
 from paddle.v2.dataset.common import md5file
+import codecs

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path):
                        'duration': duration,
                        'text': text
                    }))
-    with open(manifest_path, 'w') as out_file:
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
        for line in json_lines:
            out_file.write(line + '\n')

--- a/error_rate.py
+++ b/error_rate.py
@@ -10,47 +10,52 @@ import numpy as np


 def _levenshtein_distance(ref, hyp):
-    """Levenshtein distance is a string metric for measuring the difference between
-    two sequences. Informally, the levenshtein disctance is defined as the minimum
-    number of single-character edits (substitutions, insertions or deletions) 
-    required to change one word into the other. We can naturally extend the edits to 
-    word level when calculate levenshtein disctance for two sentences.
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the levenshtein distance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other. We can naturally
+    extend the edits to word level when calculating levenshtein distance for
+    two sentences.
    """
-    ref_len = len(ref)
-    hyp_len = len(hyp)
+    m = len(ref)
+    n = len(hyp)

    # special case
    if ref == hyp:
        return 0
-    if ref_len == 0:
-        return hyp_len
-    if hyp_len == 0:
-        return ref_len
+    if m == 0:
+        return n
+    if n == 0:
+        return m

-    distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
+    if m < n:
+        ref, hyp = hyp, ref
+        m, n = n, m
+
+    # use O(min(m, n)) space
+    distance = np.zeros((2, n + 1), dtype=np.int32)

    # initialize distance matrix
-    for j in xrange(hyp_len + 1):
+    for j in xrange(n + 1):
        distance[0][j] = j
-    for i in xrange(ref_len + 1):
-        distance[i][0] = i

    # calculate levenshtein distance
-    for i in xrange(1, ref_len + 1):
-        for j in xrange(1, hyp_len + 1):
+    for i in xrange(1, m + 1):
+        distance[i % 2][0] = i
+        for j in xrange(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
-                distance[i][j] = distance[i - 1][j - 1]
+                distance[i % 2][j] = distance[(i - 1) % 2][j - 1]
            else:
-                s_num = distance[i - 1][j - 1] + 1
-                i_num = distance[i][j - 1] + 1
-                d_num = distance[i - 1][j] + 1
-                distance[i][j] = min(s_num, i_num, d_num)
+                s_num = distance[(i - 1) % 2][j - 1] + 1
+                i_num = distance[i % 2][j - 1] + 1
+                d_num = distance[(i - 1) % 2][j] + 1
+                distance[i % 2][j] = min(s_num, i_num, d_num)

-    return distance[ref_len][hyp_len]
+    return distance[m % 2][n]


 def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
-    """Calculate word error rate (WER). WER compares reference text and 
+    """Calculate word error rate (WER). WER compares reference text and
    hypothesis text in word-level. WER is defined as:

    .. math::
@@ -65,8 +70,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
        Iw is the number of words inserted,
        Nw is the number of words in the reference

-    We can use levenshtein distance to calculate WER. Please draw an attention that 
-    empty items will be removed when splitting sentences by delimiter.
+    We can use levenshtein distance to calculate WER. Please note that empty
+    items will be removed when splitting sentences by delimiter.

    :param reference: The reference sentence.
    :type reference: basestring
@@ -95,7 +100,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
    return wer


-def cer(reference, hypothesis, ignore_case=False):
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
    """Calculate charactor error rate (CER). CER compares reference text and
    hypothesis text in char-level. CER is defined as:

@@ -111,10 +116,10 @@ def cer(reference, hypothesis, ignore_case=False):
        Ic is the number of characters inserted
        Nc is the number of characters in the reference

-    We can use levenshtein distance to calculate CER. Chinese input should be 
-    encoded to unicode. Please draw an attention that the leading and tailing 
-    white space characters will be truncated and multiple consecutive white 
-    space characters in a sentence will be replaced by one white space character.
+    We can use levenshtein distance to calculate CER. Chinese input should be
+    encoded to unicode. Please note that the leading and trailing space
+    characters will be truncated and multiple consecutive space characters
+    in a sentence will be replaced by one space character.

    :param reference: The reference sentence.
    :type reference: basestring
@@ -122,6 +127,8 @@ def cer(reference, hypothesis, ignore_case=False):
    :type hypothesis: basestring
    :param ignore_case: Whether case-sensitive or not.
    :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
    :return: Character error rate.
    :rtype: float
    :raises ValueError: If the reference length is zero.
@@ -130,8 +137,12 @@ def cer(reference, hypothesis, ignore_case=False):
        reference = reference.lower()
        hypothesis = hypothesis.lower()

-    reference = ' '.join(filter(None, reference.split(' ')))
-    hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
+    join_char = ' '
+    if remove_space == True:
+        join_char = ''
+
+    reference = join_char.join(filter(None, reference.split(' ')))
+    hypothesis = join_char.join(filter(None, hypothesis.split(' ')))

    if len(reference) == 0:
        raise ValueError("Length of reference should be greater than 0.")
--- a/tests/test_error_rate.py
+++ b/tests/test_error_rate.py
@@ -33,22 +33,34 @@ class TestParse(unittest.TestCase):
        self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)

    def test_cer_2(self):
+        ref = 'werewolf'
+        hyp = 'weae  wolf'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
+
+    def test_cer_3(self):
        ref = 'werewolf'
        char_error_rate = error_rate.cer(ref, ref)
        self.assertEqual(char_error_rate, 0.0)

-    def test_cer_3(self):
+    def test_cer_4(self):
        ref = u'我是中国人'
        hyp = u'我是 美洲人'
        char_error_rate = error_rate.cer(ref, hyp)
        self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)

-    def test_cer_4(self):
+    def test_cer_5(self):
+        ref = u'我 是 中 国 人'
+        hyp = u'我 是 美 洲 人'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.4) < 1e-6)
+
+    def test_cer_6(self):
        ref = u'我是中国人'
        char_error_rate = error_rate.cer(ref, ref)
        self.assertFalse(char_error_rate, 0.0)

-    def test_cer_5(self):
+    def test_cer_7(self):
        ref = ''
        hyp = 'Hypothesis'
        with self.assertRaises(ValueError):