Unify encoding to 'utf-8' and optimize error rate calculation.

pull/2/head
yangyaming 8 years ago
parent 7e39debcb0
commit 14d2fb795c

@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import codecs
 
 
 class TextFeaturizer(object):
@@ -59,7 +60,7 @@ class TextFeaturizer(object):
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
-        with open(vocab_filepath, 'r') as file:
+        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
         vocab_dict = dict(
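Reading the vocabulary through codecs.open decodes each line to unicode as it is read, so multi-byte tokens such as Chinese characters index correctly. Below is a minimal sketch of the same pattern, assuming a hypothetical UTF-8 vocabulary file vocab.txt with one token per line; the dict construction is one plausible completion of the truncated vocab_dict line above, not necessarily the repository's exact code.

import codecs

vocab_filepath = 'vocab.txt'  # hypothetical path: one token per line, UTF-8

with codecs.open(vocab_filepath, 'r', 'utf-8') as vocab_file:
    # Drop the trailing newline of each line, as the loader above does.
    vocab_list = [line[:-1] for line in vocab_file.readlines()]

# Map each token to an integer id (plausible completion of vocab_dict above).
vocab_dict = dict((token, idx) for idx, token in enumerate(vocab_list))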

@@ -4,6 +4,7 @@ from __future__ import division
 from __future__ import print_function
 
 import json
+import codecs
 
 
 def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
@@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
     :raises IOError: If failed to parse the manifest.
     """
     manifest = []
-    for json_line in open(manifest_path):
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
         try:
             json_data = json.loads(json_line)
         except Exception as e:
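The manifest is a JSON-lines file, so decoding it as UTF-8 keeps transcript text intact once parsed. A rough sketch of how such a reader filters by duration, assuming each record carries the 'duration' field seen in the writer below; the function name and error message here are illustrative, not the repository's exact code.

import codecs
import json

def read_manifest_sketch(manifest_path, max_duration=float('inf'), min_duration=0.0):
    """Read a JSON-lines manifest, keeping utterances within a duration range."""
    manifest = []
    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
        try:
            json_data = json.loads(json_line)
        except Exception as e:
            raise IOError("Error reading manifest: %s" % str(e))
        # Keep only utterances whose duration lies inside [min_duration, max_duration].
        if min_duration <= json_data["duration"] <= max_duration:
            manifest.append(json_data)
    return manifest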

@@ -17,6 +17,7 @@ import argparse
 import soundfile
 import json
 from paddle.v2.dataset.common import md5file
+import codecs
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path):
                         'duration': duration,
                         'text': text
                     }))
-    with open(manifest_path, 'w') as out_file:
+    with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
         for line in json_lines:
             out_file.write(line + '\n')
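Opening the output manifest with a UTF-8 codecs handle lets unicode lines be written without explicit .encode() calls. A small sketch of the write side, with hypothetical records and output path standing in for the values the real script assembles from the audio data:

import codecs
import json

# Hypothetical records; the real script derives these from the audio files.
records = [
    {'audio_filepath': 'a.wav', 'duration': 1.2, 'text': u'hello world'},
    {'audio_filepath': 'b.wav', 'duration': 2.5, 'text': u'\u4f60\u597d'},
]
json_lines = [json.dumps(record) for record in records]

manifest_path = 'manifest.sketch'  # hypothetical output path
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
    for line in json_lines:
        out_file.write(line + '\n')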

@@ -10,43 +10,48 @@ import numpy as np
 def _levenshtein_distance(ref, hyp):
-    """Levenshtein distance is a string metric for measuring the difference between
-    two sequences. Informally, the levenshtein disctance is defined as the minimum
-    number of single-character edits (substitutions, insertions or deletions)
-    required to change one word into the other. We can naturally extend the edits to
-    word level when calculate levenshtein disctance for two sentences.
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the levenshtein disctance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other. We can naturally
+    extend the edits to word level when calculate levenshtein disctance for
+    two sentences.
     """
-    ref_len = len(ref)
-    hyp_len = len(hyp)
+    m = len(ref)
+    n = len(hyp)
 
     # special case
     if ref == hyp:
         return 0
-    if ref_len == 0:
-        return hyp_len
-    if hyp_len == 0:
-        return ref_len
+    if m == 0:
+        return n
+    if n == 0:
+        return m
 
-    distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
+    if m < n:
+        ref, hyp = hyp, ref
+        m, n = n, m
+
+    # use O(min(m, n)) space
+    distance = np.zeros((2, n + 1), dtype=np.int32)
 
     # initialize distance matrix
-    for j in xrange(hyp_len + 1):
+    for j in xrange(n + 1):
         distance[0][j] = j
-    for i in xrange(ref_len + 1):
-        distance[i][0] = i
 
     # calculate levenshtein distance
-    for i in xrange(1, ref_len + 1):
-        for j in xrange(1, hyp_len + 1):
+    for i in xrange(1, m + 1):
+        distance[i % 2][0] = i
+        for j in xrange(1, n + 1):
             if ref[i - 1] == hyp[j - 1]:
-                distance[i][j] = distance[i - 1][j - 1]
+                distance[i % 2][j] = distance[(i - 1) % 2][j - 1]
             else:
-                s_num = distance[i - 1][j - 1] + 1
-                i_num = distance[i][j - 1] + 1
-                d_num = distance[i - 1][j] + 1
-                distance[i][j] = min(s_num, i_num, d_num)
+                s_num = distance[(i - 1) % 2][j - 1] + 1
+                i_num = distance[i % 2][j - 1] + 1
+                d_num = distance[(i - 1) % 2][j] + 1
+                distance[i % 2][j] = min(s_num, i_num, d_num)
 
-    return distance[ref_len][hyp_len]
+    return distance[m % 2][n]
 
 
 def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
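The rewritten helper keeps only two rows of the dynamic-programming table, since row i depends only on row i - 1; swapping ref and hyp first makes the retained rows min(m, n) + 1 long, which is the optimization the commit message refers to. Below is a standalone sketch of the same two-row scheme in plain Python (no NumPy, no xrange), for illustration only; the function name is made up and is not the module's API.

def levenshtein_two_rows(ref, hyp):
    """Edit distance with two rolling rows: O(min(m, n)) memory."""
    if ref == hyp:
        return 0
    # Keep the shorter sequence as hyp so the stored rows are as small as possible.
    if len(ref) < len(hyp):
        ref, hyp = hyp, ref
    m, n = len(ref), len(hyp)
    if n == 0:
        return m

    prev = list(range(n + 1))       # distances from the empty prefix of ref
    for i in range(1, m + 1):
        cur = [i] + [0] * n         # deleting i characters reaches the empty hyp prefix
        for j in range(1, n + 1):
            if ref[i - 1] == hyp[j - 1]:
                cur[j] = prev[j - 1]
            else:
                cur[j] = min(prev[j - 1] + 1,   # substitution
                             cur[j - 1] + 1,    # insertion
                             prev[j] + 1)       # deletion
        prev = cur
    return prev[n]

# e.g. levenshtein_two_rows('werewolf', 'weaewolf') == 1, the single substitution
# behind the 0.125 CER expected by test_cer_2 below.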
@@ -65,8 +70,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
         Iw is the number of words inserted,
         Nw is the number of words in the reference
 
-    We can use levenshtein distance to calculate WER. Please draw an attention that
-    empty items will be removed when splitting sentences by delimiter.
+    We can use levenshtein distance to calculate WER. Please draw an attention
+    that empty items will be removed when splitting sentences by delimiter.
 
     :param reference: The reference sentence.
     :type reference: basestring
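The docstring's formula, WER = (Sw + Dw + Iw) / Nw, reduces to the word-level edit distance divided by the number of reference words. A sketch of that calculation, reusing the levenshtein_two_rows helper from the sketch above; wer_sketch is an illustrative name, not the function defined in this file.

def wer_sketch(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Word error rate: word-level edit distance over reference word count."""
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()
    # Empty items are dropped when splitting by the delimiter, as the docstring notes.
    ref_words = list(filter(None, reference.split(delimiter)))
    hyp_words = list(filter(None, hypothesis.split(delimiter)))
    if len(ref_words) == 0:
        raise ValueError("Reference should contain at least one word.")
    edit_distance = levenshtein_two_rows(ref_words, hyp_words)
    return float(edit_distance) / len(ref_words)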
@@ -95,7 +100,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
     return wer
 
 
-def cer(reference, hypothesis, ignore_case=False):
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
     """Calculate charactor error rate (CER). CER compares reference text and
     hypothesis text in char-level. CER is defined as:
 
@@ -113,8 +118,8 @@ def cer(reference, hypothesis, ignore_case=False):
     We can use levenshtein distance to calculate CER. Chinese input should be
     encoded to unicode. Please draw an attention that the leading and tailing
-    white space characters will be truncated and multiple consecutive white
-    space characters in a sentence will be replaced by one white space character.
+    space characters will be truncated and multiple consecutive space
+    characters in a sentence will be replaced by one space character.
 
     :param reference: The reference sentence.
     :type reference: basestring
@@ -122,6 +127,8 @@ def cer(reference, hypothesis, ignore_case=False):
     :type hypothesis: basestring
     :param ignore_case: Whether case-sensitive or not.
     :type ignore_case: bool
+    :param remove_space: Whether remove internal space characters
+    :type remove_space: bool
     :return: Character error rate.
     :rtype: float
     :raises ValueError: If the reference length is zero.
@@ -130,8 +137,12 @@ def cer(reference, hypothesis, ignore_case=False):
         reference = reference.lower()
         hypothesis = hypothesis.lower()
 
-    reference = ' '.join(filter(None, reference.split(' ')))
-    hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
+    join_char = ' '
+    if remove_space == True:
+        join_char = ''
+
+    reference = join_char.join(filter(None, reference.split(' ')))
+    hypothesis = join_char.join(filter(None, hypothesis.split(' ')))
 
     if len(reference) == 0:
         raise ValueError("Length of reference should be greater than 0.")

@@ -33,22 +33,34 @@ class TestParse(unittest.TestCase):
         self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
 
     def test_cer_2(self):
+        ref = 'werewolf'
+        hyp = 'weae wolf'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
+
+    def test_cer_3(self):
         ref = 'werewolf'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertEqual(char_error_rate, 0.0)
 
-    def test_cer_3(self):
+    def test_cer_4(self):
         ref = u'我是中国人'
         hyp = u'我是 美洲人'
         char_error_rate = error_rate.cer(ref, hyp)
         self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)
 
-    def test_cer_4(self):
+    def test_cer_5(self):
+        ref = u'我 是 中 国 人'
+        hyp = u'我 是 美 洲 人'
+        char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
+        self.assertTrue(abs(char_error_rate - 0.4) < 1e-6)
+
+    def test_cer_6(self):
         ref = u'我是中国人'
         char_error_rate = error_rate.cer(ref, ref)
         self.assertFalse(char_error_rate, 0.0)
 
-    def test_cer_5(self):
+    def test_cer_7(self):
         ref = ''
         hyp = 'Hypothesis'
         with self.assertRaises(ValueError):
