Merge pull request #190 from pkuyym/unify_encoding

Unify encoding to 'utf-8' and optimize error rate calculation.
pull/2/head
Yang yaming 7 years ago committed by GitHub
commit 8c5103725b

@ -91,7 +91,7 @@ class DataGenerator(object):
:param transcript: Transcription text.
:type transcript: basestring
:return: Tuple of audio feature tensor and list of token ids for
transcription.
transcription.
:rtype: tuple of (2darray, list)
"""
speech_segment = SpeechSegment.from_file(filename, transcript)
@ -111,7 +111,7 @@ class DataGenerator(object):
"""
Batch data reader creator for audio data. Return a callable generator
function to produce batches of data.
Audio features within one batch will be padded with zeros to have the
same shape, or a user-defined shape.
@ -191,9 +191,9 @@ class DataGenerator(object):
@property
def feeding(self):
"""Returns data reader's feeding dict.
:return: Data feeding dict.
:rtype: dict
:rtype: dict
"""
return {"audio_spectrogram": 0, "transcript_text": 1}

@ -4,6 +4,7 @@ from __future__ import division
from __future__ import print_function
import os
import codecs
class TextFeaturizer(object):
@ -59,7 +60,7 @@ class TextFeaturizer(object):
def _load_vocabulary_from_file(self, vocab_filepath):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r') as file:
with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
vocab_dict = dict(

@ -4,15 +4,16 @@ from __future__ import division
from __future__ import print_function
import json
import codecs
def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
"""Load and parse manifest file.
Instances with durations outside [min_duration, max_duration] will be
filtered out.
:param manifest_path: Manifest file to load and parse.
:param manifest_path: Manifest file to load and parse.
:type manifest_path: basestring
:param max_duration: Maximal duration in seconds for instance filter.
:type max_duration: float
@ -23,7 +24,7 @@ def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
:raises IOError: If failed to parse the manifest.
"""
manifest = []
for json_line in open(manifest_path):
for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
try:
json_data = json.loads(json_line)
except Exception as e:

@ -16,6 +16,7 @@ import tarfile
import argparse
import soundfile
import json
import codecs
from paddle.v2.dataset.common import md5file
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@ -112,7 +113,7 @@ def create_manifest(data_dir, manifest_path):
'duration': duration,
'text': text
}))
with open(manifest_path, 'w') as out_file:
with codecs.open(manifest_path, 'w', 'utf-8') as out_file:
for line in json_lines:
out_file.write(line + '\n')

@ -10,47 +10,54 @@ import numpy as np
def _levenshtein_distance(ref, hyp):
"""Levenshtein distance is a string metric for measuring the difference between
two sequences. Informally, the levenshtein disctance is defined as the minimum
number of single-character edits (substitutions, insertions or deletions)
required to change one word into the other. We can naturally extend the edits to
word level when calculate levenshtein disctance for two sentences.
"""Levenshtein distance is a string metric for measuring the difference
between two sequences. Informally, the Levenshtein distance is defined as
the minimum number of single-character edits (substitutions, insertions or
deletions) required to change one word into the other. We can naturally
extend the edits to word level when calculating the Levenshtein distance
for two sentences.
"""
ref_len = len(ref)
hyp_len = len(hyp)
m = len(ref)
n = len(hyp)
# special case
if ref == hyp:
return 0
if ref_len == 0:
return hyp_len
if hyp_len == 0:
return ref_len
if m == 0:
return n
if n == 0:
return m
distance = np.zeros((ref_len + 1, hyp_len + 1), dtype=np.int32)
if m < n:
ref, hyp = hyp, ref
m, n = n, m
# use O(min(m, n)) space
distance = np.zeros((2, n + 1), dtype=np.int32)
# initialize distance matrix
for j in xrange(hyp_len + 1):
for j in xrange(n + 1):
distance[0][j] = j
for i in xrange(ref_len + 1):
distance[i][0] = i
# calculate levenshtein distance
for i in xrange(1, ref_len + 1):
for j in xrange(1, hyp_len + 1):
for i in xrange(1, m + 1):
prev_row_idx = (i - 1) % 2
cur_row_idx = i % 2
distance[cur_row_idx][0] = i
for j in xrange(1, n + 1):
if ref[i - 1] == hyp[j - 1]:
distance[i][j] = distance[i - 1][j - 1]
distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
else:
s_num = distance[i - 1][j - 1] + 1
i_num = distance[i][j - 1] + 1
d_num = distance[i - 1][j] + 1
distance[i][j] = min(s_num, i_num, d_num)
s_num = distance[prev_row_idx][j - 1] + 1
i_num = distance[cur_row_idx][j - 1] + 1
d_num = distance[prev_row_idx][j] + 1
distance[cur_row_idx][j] = min(s_num, i_num, d_num)
return distance[ref_len][hyp_len]
return distance[m % 2][n]
def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
"""Calculate word error rate (WER). WER compares reference text and
"""Calculate word error rate (WER). WER compares reference text and
hypothesis text in word-level. WER is defined as:
.. math::
@ -65,8 +72,8 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
Iw is the number of words inserted,
Nw is the number of words in the reference
We can use levenshtein distance to calculate WER. Please draw an attention that
empty items will be removed when splitting sentences by delimiter.
We can use Levenshtein distance to calculate WER. Please note that empty
items will be removed when splitting sentences by delimiter.
:param reference: The reference sentence.
:type reference: basestring
@ -95,7 +102,7 @@ def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
return wer
def cer(reference, hypothesis, ignore_case=False):
def cer(reference, hypothesis, ignore_case=False, remove_space=False):
"""Calculate character error rate (CER). CER compares reference text and
hypothesis text in char-level. CER is defined as:
@ -111,10 +118,10 @@ def cer(reference, hypothesis, ignore_case=False):
Ic is the number of characters inserted
Nc is the number of characters in the reference
We can use levenshtein distance to calculate CER. Chinese input should be
encoded to unicode. Please draw an attention that the leading and tailing
white space characters will be truncated and multiple consecutive white
space characters in a sentence will be replaced by one white space character.
We can use Levenshtein distance to calculate CER. Chinese input should be
encoded to unicode. Please note that the leading and trailing space
characters will be removed and multiple consecutive space characters in a
sentence will be replaced by one space character.
:param reference: The reference sentence.
:type reference: basestring
@ -122,6 +129,8 @@ def cer(reference, hypothesis, ignore_case=False):
:type hypothesis: basestring
:param ignore_case: Whether case-sensitive or not.
:type ignore_case: bool
:param remove_space: Whether to remove internal space characters.
:type remove_space: bool
:return: Character error rate.
:rtype: float
:raises ValueError: If the reference length is zero.
@ -130,8 +139,12 @@ def cer(reference, hypothesis, ignore_case=False):
reference = reference.lower()
hypothesis = hypothesis.lower()
reference = ' '.join(filter(None, reference.split(' ')))
hypothesis = ' '.join(filter(None, hypothesis.split(' ')))
join_char = ' '
if remove_space == True:
join_char = ''
reference = join_char.join(filter(None, reference.split(' ')))
hypothesis = join_char.join(filter(None, hypothesis.split(' ')))
if len(reference) == 0:
raise ValueError("Length of reference should be greater than 0.")

@ -11,16 +11,54 @@ import error_rate
class TestParse(unittest.TestCase):
def test_wer_1(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night'
hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last '\
'night'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6)
def test_wer_2(self):
ref = 'as any in england i would say said gamewell proudly that is '\
'in his day'
hyp = 'as any in england i would say said came well proudly that is '\
'in his day'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.1333333) < 1e-6)
def test_wer_3(self):
ref = 'the lieutenant governor lilburn w boggs afterward governor '\
'was a pronounced mormon hater and throughout the period of '\
'the troubles he manifested sympathy with the persecutors'
hyp = 'the lieutenant governor little bit how bags afterward '\
'governor was a pronounced warman hater and throughout the '\
'period of th troubles he manifests sympathy with the '\
'persecutors'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.2692307692) < 1e-6)
def test_wer_4(self):
ref = 'the wood flamed up splendidly under the large brewing copper '\
'and it sighed so deeply'
hyp = 'the wood flame do splendidly under the large brewing copper '\
'and its side so deeply'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.2666666667) < 1e-6)
def test_wer_5(self):
ref = 'all the morning they trudged up the mountain path and at noon '\
'unc and ojo sat on a fallen tree trunk and ate the last of '\
'the bread which the old munchkin had placed in his pocket'
hyp = 'all the morning they trudged up the mountain path and at noon '\
'unc in ojo sat on a fallen tree trunk and ate the last of '\
'the bread which the old munchkin had placed in his pocket'
word_error_rate = error_rate.wer(ref, hyp)
self.assertTrue(abs(word_error_rate - 0.027027027) < 1e-6)
def test_wer_6(self):
ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
word_error_rate = error_rate.wer(ref, ref)
self.assertEqual(word_error_rate, 0.0)
def test_wer_3(self):
def test_wer_7(self):
ref = ' '
hyp = 'Hypothesis sentence'
with self.assertRaises(ValueError):
@ -33,22 +71,40 @@ class TestParse(unittest.TestCase):
self.assertTrue(abs(char_error_rate - 0.25) < 1e-6)
def test_cer_2(self):
ref = 'werewolf'
hyp = 'weae wolf'
char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
self.assertTrue(abs(char_error_rate - 0.125) < 1e-6)
def test_cer_3(self):
ref = 'were wolf'
hyp = 'were wolf'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.0) < 1e-6)
def test_cer_4(self):
ref = 'werewolf'
char_error_rate = error_rate.cer(ref, ref)
self.assertEqual(char_error_rate, 0.0)
def test_cer_3(self):
def test_cer_5(self):
ref = u'我是中国人'
hyp = u'我是 美洲人'
char_error_rate = error_rate.cer(ref, hyp)
self.assertTrue(abs(char_error_rate - 0.6) < 1e-6)
def test_cer_4(self):
def test_cer_6(self):
ref = u'我 是 中 国 人'
hyp = u'我 是 美 洲 人'
char_error_rate = error_rate.cer(ref, hyp, remove_space=True)
self.assertTrue(abs(char_error_rate - 0.4) < 1e-6)
def test_cer_7(self):
ref = u'我是中国人'
char_error_rate = error_rate.cer(ref, ref)
self.assertFalse(char_error_rate, 0.0)
def test_cer_5(self):
def test_cer_8(self):
ref = ''
hyp = 'Hypothesis'
with self.assertRaises(ValueError):

Loading…
Cancel
Save