diff --git a/data_utils/data.py b/data_utils/data.py index 159bf69d..14b02f99 100644 --- a/data_utils/data.py +++ b/data_utils/data.py @@ -91,7 +91,7 @@ class DataGenerator(object): :param transcript: Transcription text. :type transcript: basestring :return: Tuple of audio feature tensor and list of token ids for - transcription. + transcription. :rtype: tuple of (2darray, list) """ speech_segment = SpeechSegment.from_file(filename, transcript) @@ -111,7 +111,7 @@ class DataGenerator(object): """ Batch data reader creator for audio data. Return a callable generator function to produce batches of data. - + Audio features within one batch will be padded with zeros to have the same shape, or a user-defined shape. @@ -191,9 +191,9 @@ class DataGenerator(object): @property def feeding(self): """Returns data reader's feeding dict. - + :return: Data feeding dict. - :rtype: dict + :rtype: dict """ return {"audio_spectrogram": 0, "transcript_text": 1} diff --git a/data_utils/featurizer/text_featurizer.py b/data_utils/featurizer/text_featurizer.py index 4f9a49b5..89202163 100644 --- a/data_utils/featurizer/text_featurizer.py +++ b/data_utils/featurizer/text_featurizer.py @@ -4,6 +4,7 @@ from __future__ import division from __future__ import print_function import os +import codecs class TextFeaturizer(object): @@ -59,7 +60,7 @@ class TextFeaturizer(object): def _load_vocabulary_from_file(self, vocab_filepath): """Load vocabulary from file.""" vocab_lines = [] - with open(vocab_filepath, 'r') as file: + with codecs.open(vocab_filepath, 'r', 'utf-8') as file: vocab_lines.extend(file.readlines()) vocab_list = [line[:-1] for line in vocab_lines] vocab_dict = dict( diff --git a/data_utils/utils.py b/data_utils/utils.py index 3f116571..f970ff55 100644 --- a/data_utils/utils.py +++ b/data_utils/utils.py @@ -4,15 +4,16 @@ from __future__ import division from __future__ import print_function import json +import codecs def read_manifest(manifest_path, max_duration=float('inf'), 
min_duration=0.0): """Load and parse manifest file. - + Instances with durations outside [min_duration, max_duration] will be filtered out. - :param manifest_path: Manifest file to load and parse. + :param manifest_path: Manifest file to load and parse. :type manifest_path: basestring :param max_duration: Maximal duration in seconds for instance filter. :type max_duration: float @@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0): :raises IOError: If failed to parse the manifest. """ manifest = [] - for json_line in open(manifest_path): + for json_line in codecs.open(manifest_path, 'r', 'utf-8'): try: json_data = json.loads(json_line) except Exception as e: diff --git a/datasets/librispeech/librispeech.py b/datasets/librispeech/librispeech.py index 7e941f0e..d963a7d5 100644 --- a/datasets/librispeech/librispeech.py +++ b/datasets/librispeech/librispeech.py @@ -16,6 +16,7 @@ import tarfile import argparse import soundfile import json +import codecs from paddle.v2.dataset.common import md5file DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') @@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path): 'duration': duration, 'text': text })) - with open(manifest_path, 'w') as out_file: + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: for line in json_lines: out_file.write(line + '\n') diff --git a/error_rate.py b/error_rate.py index 0cf17921..ea829f47 100644 --- a/error_rate.py +++ b/error_rate.py @@ -10,47 +10,54 @@ import numpy as np def _levenshtein_distance(ref, hyp): - """Levenshtein distance is a string metric for measuring the difference between - two sequences. Informally, the levenshtein disctance is defined as the minimum - number of single-character edits (substitutions, insertions or deletions) - required to change one word into the other. We can naturally extend the edits to - word level when calculate levenshtein disctance for two sentences. 
+ """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein distance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculating levenshtein distance for + two sentences. """ - ref_len = len(ref) - hyp_len = len(hyp) + m = len(ref) + n = len(hyp) # special case if ref == hyp: return 0 - if ref_len == 0: - return hyp_len - if hyp_len == 0: - return ref_len + if m == 0: + return n + if n == 0: + return m - distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32) + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) # initialize distance matrix - for j in xrange(hyp_len + 1): + for j in xrange(n + 1): distance[0][j] = j - for i in xrange(ref_len + 1): - distance[i][0] = i # calculate levenshtein distance - for i in xrange(1, ref_len + 1): - for j in xrange(1, hyp_len + 1): + for i in xrange(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in xrange(1, n + 1): if ref[i - 1] == hyp[j - 1]: - distance[i][j] = distance[i - 1][j - 1] + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] else: - s_num = distance[i - 1][j - 1] + 1 - i_num = distance[i][j - 1] + 1 - d_num = distance[i - 1][j] + 1 - distance[i][j] = min(s_num, i_num, d_num) + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) - return distance[ref_len][hyp_len] + return distance[m % 2][n] def wer(reference, hypothesis, ignore_case=False, delimiter=' '): - """Calculate word error rate (WER). WER compares reference text and + """Calculate word error rate (WER). WER compares reference text and hypothesis text in word-level. 
WER is defined as: .. math:: @@ -65,8 +72,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): Iw is the number of words inserted, Nw is the number of words in the reference - We can use levenshtein distance to calculate WER. Please draw an attention that - empty items will be removed when splitting sentences by delimiter. + We can use levenshtein distance to calculate WER. Please note + that empty items will be removed when splitting sentences by delimiter. :param reference: The reference sentence. :type reference: basestring @@ -95,7 +102,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '): return wer -def cer(reference, hypothesis, ignore_case=False): +def cer(reference, hypothesis, ignore_case=False, remove_space=False): """Calculate charactor error rate (CER). CER compares reference text and hypothesis text in char-level. CER is defined as: @@ -111,10 +118,10 @@ def cer(reference, hypothesis, ignore_case=False): Ic is the number of characters inserted Nc is the number of characters in the reference - We can use levenshtein distance to calculate CER. Chinese input should be - encoded to unicode. Please draw an attention that the leading and tailing - white space characters will be truncated and multiple consecutive white - space characters in a sentence will be replaced by one white space character. + We can use levenshtein distance to calculate CER. Chinese input should be + encoded to unicode. Please note that the leading and trailing + space characters will be truncated and multiple consecutive space + characters in a sentence will be replaced by one space character. :param reference: The reference sentence. :type reference: basestring @@ -122,6 +129,8 @@ def cer(reference, hypothesis, ignore_case=False): :type hypothesis: basestring :param ignore_case: Whether case-sensitive or not. 
:type ignore_case: bool + :param remove_space: Whether to remove internal space characters + :type remove_space: bool :return: Character error rate. :rtype: float :raises ValueError: If the reference length is zero. @@ -130,8 +139,12 @@ def cer(reference, hypothesis, ignore_case=False): reference = reference.lower() hypothesis = hypothesis.lower() - reference = ' '.join(filter(None, reference.split(' '))) - hypothesis = ' '.join(filter(None, hypothesis.split(' '))) + join_char = ' ' + if remove_space == True: + join_char = '' + + reference = join_char.join(filter(None, reference.split(' '))) + hypothesis = join_char.join(filter(None, hypothesis.split(' '))) if len(reference) == 0: raise ValueError("Length of reference should be greater than 0.") diff --git a/tests/test_error_rate.py b/tests/test_error_rate.py index be7313f3..99e137a9 100644 --- a/tests/test_error_rate.py +++ b/tests/test_error_rate.py @@ -11,16 +11,54 @@ import error_rate class TestParse(unittest.TestCase): def test_wer_1(self): ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' - hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night' + hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last '\ + 'night' word_error_rate = error_rate.wer(ref, hyp) self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6) def test_wer_2(self): + ref = 'as any in england i would say said gamewell proudly that is '\ + 'in his day' + hyp = 'as any in england i would say said came well proudly that is '\ + 'in his day' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.1333333) < 1e-6) + + def test_wer_3(self): + ref = 'the lieutenant governor lilburn w boggs afterward governor '\ + 'was a pronounced mormon hater and throughout the period of '\ + 'the troubles he manifested sympathy with the persecutors' + hyp = 'the lieutenant governor little bit how bags afterward '\ + 'governor was a pronounced warman hater and throughout the 
'\ + 'period of th troubles he manifests sympathy with the '\ + 'persecutors' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2692307692) < 1e-6) + + def test_wer_4(self): + ref = 'the wood flamed up splendidly under the large brewing copper '\ + 'and it sighed so deeply' + hyp = 'the wood flame do splendidly under the large brewing copper '\ + 'and its side so deeply' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2666666667) < 1e-6) + + def test_wer_5(self): + ref = 'all the morning they trudged up the mountain path and at noon '\ + 'unc and ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + hyp = 'all the morning they trudged up the mountain path and at noon '\ + 'unc in ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.027027027) < 1e-6) + + def test_wer_6(self): ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' word_error_rate = error_rate.wer(ref, ref) self.assertEqual(word_error_rate, 0.0) - def test_wer_3(self): + def test_wer_7(self): ref = ' ' hyp = 'Hypothesis sentence' with self.assertRaises(ValueError): @@ -33,22 +71,40 @@ class TestParse(unittest.TestCase): self.assertTrue(abs(char_error_rate - 0.25) < 1e-6) def test_cer_2(self): + ref = 'werewolf' + hyp = 'weae wolf' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.125) < 1e-6) + + def test_cer_3(self): + ref = 'were wolf' + hyp = 'were wolf' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.0) < 1e-6) + + def test_cer_4(self): ref = 'werewolf' char_error_rate = error_rate.cer(ref, ref) self.assertEqual(char_error_rate, 0.0) - def test_cer_3(self): + def test_cer_5(self): ref = u'我是中国人' hyp = 
u'我是 美洲人' char_error_rate = error_rate.cer(ref, hyp) self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) - def test_cer_4(self): + def test_cer_6(self): + ref = u'我 是 中 国 人' + hyp = u'我 是 美 洲 人' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.4) < 1e-6) + + def test_cer_7(self): ref = u'我是中国人' char_error_rate = error_rate.cer(ref, ref) self.assertFalse(char_error_rate, 0.0) - def test_cer_5(self): + def test_cer_8(self): ref = '' hyp = 'Hypothesis' with self.assertRaises(ValueError):