Unify encoding to 'utf-8' and optimize error rate calculation.

pull/2/head
yangyaming 8 years ago
parent 7e39debcb0
commit 14d2fb795c

@@ -91,7 +91,7 @@ class DataGenerator(object):
         :param transcript: Transcription text.
         :type transcript: basestring
         :return: Tuple of audio feature tensor and list of token ids for
                  transcription.
         :rtype: tuple of (2darray, list)
         """
         speech_segment = SpeechSegment.from_file(filename, transcript)
@@ -111,7 +111,7 @@ class DataGenerator(object):
         """
         Batch data reader creator for audio data. Return a callable generator
         function to produce batches of data.

         Audio features within one batch will be padded with zeros to have the
         same shape, or a user-defined shape.

@@ -191,9 +191,9 @@ class DataGenerator(object):

     @property
     def feeding(self):
         """Returns data reader's feeding dict.

         :return: Data feeding dict.
         :rtype: dict
         """
         return {"audio_spectrogram": 0, "transcript_text": 1}

@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function

 import os
+import codecs


 class TextFeaturizer(object):
@@ -59,7 +60,7 @@ class TextFeaturizer(object):
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
-        with open(vocab_filepath, 'r') as file:
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
         vocab_dict = dict(

@@ -4,15 +4,16 @@ from __future__ import division
 from __future__ import print_function

 import json
+import codecs


 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     """Load and parse manifest file.

     Instances with durations outside [min_duration, max_duration] will be
     filtered out.

     :param manifest_path: Manifest file to load and parse.
     :type manifest_path: basestring
     :param max_duration: Maximal duration in seconds for instance filter.
     :type max_duration: float
@@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     :raises IOError: If failed to parse the manifest.
     """
     manifest = []
-    for json_line in open(manifest_path):
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
         try:
             json_data = json.loads(json_line)
         except Exception as e:

@@ -17,6 +17,7 @@ import argparse
 import soundfile
 import json
 from paddle.v2.dataset.common import md5file
+import codecs


 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path):
                 'duration': duration,
                 'text': text
             }))
-    with open(manifest_path, 'w') as out_file:
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
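The three hunks above (TextFeaturizer, read_manifest, create_manifest) make the same change: the plain built-in open() is swapped for codecs.open(..., 'utf-8'), so text is decoded to unicode on read and encoded as UTF-8 on write. Below is a minimal sketch of that pattern for Python 2 era code; the file names 'vocab.txt' and 'manifest.json' and the ensure_ascii=False flag are illustrative choices, not taken from the diff.

import codecs
import json

# Read a vocabulary file as unicode, one token per line (as in TextFeaturizer).
with codecs.open('vocab.txt', 'r', 'utf-8') as vocab_file:
    vocab_list = [line[:-1] for line in vocab_file.readlines()]

# Write manifest entries as UTF-8 JSON lines, then parse them back
# (as in create_manifest / read_manifest).
entries = [{'audio_filepath': 'a.wav', 'duration': 1.0, 'text': u'我是中国人'}]
with codecs.open('manifest.json', 'w', 'utf-8') as out_file:
    for entry in entries:
        out_file.write(json.dumps(entry, ensure_ascii=False) + '\n')

with codecs.open('manifest.json', 'r', 'utf-8') as in_file:
    manifest = [json.loads(json_line) for json_line in in_file]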

@@ -10,47 +10,52 @@ import numpy as np

 def _levenshtein_distance(ref, hyp):
-    """Levenshtein distance is a string metric for measuring the difference between
-    two sequences. Informally, the levenshtein disctance is defined as the minimum
-    number of single-character edits (substitutions, insertions or deletions)
-    required to change one word into the other. We can naturally extend the edits to
-    word level when calculate levenshtein disctance for two sentences.
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the levenshtein disctance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other. We can naturally
+    extend the edits to word level when calculate levenshtein disctance for
+    two sentences.
     """
-    ref_len = len(ref)
-    hyp_len = len(hyp)
+    m = len(ref)
+    n = len(hyp)

     # special case
     if ref == hyp:
         return 0
-    if ref_len == 0:
-        return hyp_len
-    if hyp_len == 0:
-        return ref_len
+    if m == 0:
+        return n
+    if n == 0:
+        return m

-    distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
+    if m < n:
+        ref, hyp = hyp, ref
+        m, n = n, m
+
+    # use O(min(m, n)) space
+    distance = np.zeros((2, n + 1), dtype=np.int32)

     # initialize distance matrix
-    for j in xrange(hyp_len + 1):
+    for j in xrange(n + 1):
         distance[0][j] = j
-    for i in xrange(ref_len + 1):
-        distance[i][0] = i

     # calculate levenshtein distance
-    for i in xrange(1, ref_len + 1):
-        for j in xrange(1, hyp_len + 1):
+    for i in xrange(1, m + 1):
+        distance[i % 2][0] = i
+        for j in xrange(1, n + 1):
             if ref[i - 1] == hyp[j - 1]:
-                distance[i][j] = distance[i - 1][j - 1]
+                distance[i % 2][j] = distance[(i - 1) % 2][j - 1]
             else:
-                s_num = distance[i - 1][j - 1] + 1
-                i_num = distance[i][j - 1] + 1
-                d_num = distance[i - 1][j] + 1
-                distance[i][j] = min(s_num, i_num, d_num)
+                s_num = distance[(i - 1) % 2][j - 1] + 1
+                i_num = distance[i % 2][j - 1] + 1
+                d_num = distance[(i - 1) % 2][j] + 1
+                distance[i % 2][j] = min(s_num, i_num, d_num)

-    return distance[ref_len][hyp_len]
+    return distance[m % 2][n]


 def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     """Calculate word error rate (WER). WER compares reference text and
     hypothesis text in word-level. WER is defined as:

     .. math::
@@ -65,8 +70,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     Iw is the number of words inserted,
     Nw is the number of words in the reference

-    We can use levenshtein distance to calculate WER. Please draw an attention that
-    empty items will be removed when splitting sentences by delimiter.
+    We can use levenshtein distance to calculate WER. Please draw an attention
+    that empty items will be removed when splitting sentences by delimiter.

     :param reference: The reference sentence.
     :type reference: basestring
@@ -95,7 +100,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     return wer


-def cer(reference, hypothesis, ignore_case=False):
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     """Calculate charactor error rate (CER). CER compares reference text and
     hypothesis text in char-level. CER is defined as:
@@ -111,10 +116,10 @@ def cer(reference, hypothesis, ignore_case=False):
     Ic is the number of characters inserted
     Nc is the number of characters in the reference

     We can use levenshtein distance to calculate CER. Chinese input should be
     encoded to unicode. Please draw an attention that the leading and tailing
-    white space characters will be truncated and multiple consecutive white
-    space characters in a sentence will be replaced by one white space character.
+    space characters will be truncated and multiple consecutive space
+    characters in a sentence will be replaced by one space character.

     :param reference: The reference sentence.
     :type reference: basestring
@@ -122,6 +127,8 @@ def cer(reference, hypothesis, ignore_case=False):
     :type hypothesis: basestring
     :param ignore_case: Whether case-sensitive or not.
     :type ignore_case: bool
+    :param remove_space: Whether remove internal space characters
+    :type remove_space: bool
     :return: Character error rate.
     :rtype: float
     :raises ValueError: If the reference length is zero.
@@ -130,8 +137,12 @@ def cer(reference, hypothesis, ignore_case=False):
         reference = reference.lower()
         hypothesis = hypothesis.lower()

-    reference = ' '.join(filter(None, reference.split(' ')))
-    hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
+    join_char = ' '
+    if remove_space == True:
+        join_char = ''
+
+    reference = join_char.join(filter(None, reference.split(' ')))
+    hypothesis = join_char.join(filter(None, hypothesis.split(' ')))

     if len(reference) == 0:
         raise ValueError("Length of reference should be greater than 0.")
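As a quick usage sketch of the updated interface (assuming the module is importable as error_rate, as in the tests below): with remove_space=True the internal spaces are stripped before comparison, so 'weae wolf' collapses to 'weaewolf' and differs from 'werewolf' by a single substitution, while wer() keeps its original signature. The English sentences in the last call are illustrative, not taken from the diff.

import error_rate

# 1 substitution over 8 reference characters -> CER = 0.125
print(error_rate.cer('werewolf', 'weae wolf', remove_space=True))

# Spaces between characters are removed, leaving 2 substitutions over
# 5 reference characters -> CER = 0.4
print(error_rate.cer(u'我 是 中 国 人', u'我 是 美 洲 人', remove_space=True))

# WER is unchanged: 1 substituted word out of 4 reference words -> 0.25
print(error_rate.wer('i like green eggs', 'i like green ham'))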

@@ -33,22 +33,34 @@ class TestParse(unittest.TestCase):
         self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)

     def test_cer_2(self):
+        ref = 'werewolf'
+        hyp = 'weae wolf'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
+
+    def test_cer_3(self):
         ref = 'werewolf'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertEqual(char_error_rate, 0.0)

-    def test_cer_3(self):
+    def test_cer_4(self):
         ref = u'我是中国人'
         hyp = u'我是 美洲人'
         char_error_rate = error_rate.cer(ref, hyp)
         self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)

-    def test_cer_4(self):
+    def test_cer_5(self):
+        ref = u'我 是 中 国 人'
+        hyp = u'我 是 美 洲 人'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.4) < 1e-6)
+
+    def test_cer_6(self):
         ref = u'我是中国人'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertFalse(char_error_rate, 0.0)

-    def test_cer_5(self):
+    def test_cer_7(self):
         ref = ''
         hyp = 'Hypothesis'
         with self.assertRaises(ValueError):
