|
|
@ -1,15 +1,18 @@
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# CopyRight WeNet Apache-2.0 License
|
|
|
|
# CopyRight WeNet Apache-2.0 License
|
|
|
|
|
|
|
|
|
|
|
|
import re, sys, unicodedata
|
|
|
|
|
|
|
|
import codecs
|
|
|
|
import codecs
|
|
|
|
|
|
|
|
import re
|
|
|
|
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import unicodedata
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): presumably toggles tag stripping (see stripoff_tags); the code
# that reads this flag is outside this view -- confirm.
remove_tag = True
# Whitespace characters: space, tab, carriage return, newline.
spacelist = [' ', '\t', '\r', '\n']
# ASCII punctuation followed by CJK / full-width punctuation marks.
puncts = [
    '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
    '《', '》'
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def characterize(string):
|
|
|
|
def characterize(string):
|
|
|
|
res = []
|
|
|
|
res = []
|
|
|
@ -43,10 +46,12 @@ def characterize(string) :
|
|
|
|
i = j
|
|
|
|
i = j
|
|
|
|
return res
|
|
|
|
return res
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stripoff_tags(x):
|
|
|
|
def stripoff_tags(x):
|
|
|
|
if not x: return ''
|
|
|
|
if not x: return ''
|
|
|
|
chars = []
|
|
|
|
chars = []
|
|
|
|
i = 0; T=len(x)
|
|
|
|
i = 0
|
|
|
|
|
|
|
|
T = len(x)
|
|
|
|
while i < T:
|
|
|
|
while i < T:
|
|
|
|
if x[i] == '<':
|
|
|
|
if x[i] == '<':
|
|
|
|
while i < T and x[i] != '>':
|
|
|
|
while i < T and x[i] != '>':
|
|
|
@ -78,6 +83,7 @@ def normalize(sentence, ignore_words, cs, split=None):
|
|
|
|
new_sentence.append(x)
|
|
|
|
new_sentence.append(x)
|
|
|
|
return new_sentence
|
|
|
|
return new_sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Calculator:
|
|
|
|
class Calculator:
|
|
|
|
def __init__(self):
|
|
|
|
def __init__(self):
|
|
|
|
self.data = {}
|
|
|
|
self.data = {}
|
|
|
@ -87,6 +93,7 @@ class Calculator :
|
|
|
|
self.cost['sub'] = 1
|
|
|
|
self.cost['sub'] = 1
|
|
|
|
self.cost['del'] = 1
|
|
|
|
self.cost['del'] = 1
|
|
|
|
self.cost['ins'] = 1
|
|
|
|
self.cost['ins'] = 1
|
|
|
|
|
|
|
|
|
|
|
|
def calculate(self, lab, rec):
|
|
|
|
def calculate(self, lab, rec):
|
|
|
|
# Initialization
|
|
|
|
# Initialization
|
|
|
|
lab.insert(0, '')
|
|
|
|
lab.insert(0, '')
|
|
|
@ -108,10 +115,22 @@ class Calculator :
|
|
|
|
self.space[0][0]['error'] = 'non'
|
|
|
|
self.space[0][0]['error'] = 'non'
|
|
|
|
for token in lab:
|
|
|
|
for token in lab:
|
|
|
|
if token not in self.data and len(token) > 0:
|
|
|
|
if token not in self.data and len(token) > 0:
|
|
|
|
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
|
|
|
|
self.data[token] = {
|
|
|
|
|
|
|
|
'all': 0,
|
|
|
|
|
|
|
|
'cor': 0,
|
|
|
|
|
|
|
|
'sub': 0,
|
|
|
|
|
|
|
|
'ins': 0,
|
|
|
|
|
|
|
|
'del': 0
|
|
|
|
|
|
|
|
}
|
|
|
|
for token in rec:
|
|
|
|
for token in rec:
|
|
|
|
if token not in self.data and len(token) > 0:
|
|
|
|
if token not in self.data and len(token) > 0:
|
|
|
|
self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0}
|
|
|
|
self.data[token] = {
|
|
|
|
|
|
|
|
'all': 0,
|
|
|
|
|
|
|
|
'cor': 0,
|
|
|
|
|
|
|
|
'sub': 0,
|
|
|
|
|
|
|
|
'ins': 0,
|
|
|
|
|
|
|
|
'del': 0
|
|
|
|
|
|
|
|
}
|
|
|
|
# Computing edit distance
|
|
|
|
# Computing edit distance
|
|
|
|
for i, lab_token in enumerate(lab):
|
|
|
|
for i, lab_token in enumerate(lab):
|
|
|
|
for j, rec_token in enumerate(rec):
|
|
|
|
for j, rec_token in enumerate(rec):
|
|
|
@ -141,7 +160,15 @@ class Calculator :
|
|
|
|
self.space[i][j]['dist'] = min_dist
|
|
|
|
self.space[i][j]['dist'] = min_dist
|
|
|
|
self.space[i][j]['error'] = min_error
|
|
|
|
self.space[i][j]['error'] = min_error
|
|
|
|
# Tracing back
|
|
|
|
# Tracing back
|
|
|
|
result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0}
|
|
|
|
result = {
|
|
|
|
|
|
|
|
'lab': [],
|
|
|
|
|
|
|
|
'rec': [],
|
|
|
|
|
|
|
|
'all': 0,
|
|
|
|
|
|
|
|
'cor': 0,
|
|
|
|
|
|
|
|
'sub': 0,
|
|
|
|
|
|
|
|
'ins': 0,
|
|
|
|
|
|
|
|
'del': 0
|
|
|
|
|
|
|
|
}
|
|
|
|
i = len(lab) - 1
|
|
|
|
i = len(lab) - 1
|
|
|
|
j = len(rec) - 1
|
|
|
|
j = len(rec) - 1
|
|
|
|
while True:
|
|
|
|
while True:
|
|
|
@ -184,8 +211,11 @@ class Calculator :
|
|
|
|
elif self.space[i][j]['error'] == 'non': # starting point
|
|
|
|
elif self.space[i][j]['error'] == 'non': # starting point
|
|
|
|
break
|
|
|
|
break
|
|
|
|
else: # shouldn't reach here
|
|
|
|
else: # shouldn't reach here
|
|
|
|
print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error']))
|
|
|
|
print(
|
|
|
|
|
|
|
|
'this should not happen , i = {i} , j = {j} , error = {error}'.
|
|
|
|
|
|
|
|
format(i=i, j=j, error=self.space[i][j]['error']))
|
|
|
|
return result
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def overall(self):
|
|
|
|
def overall(self):
|
|
|
|
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
|
|
|
|
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
|
|
|
|
for token in self.data:
|
|
|
|
for token in self.data:
|
|
|
@ -195,6 +225,7 @@ class Calculator :
|
|
|
|
result['ins'] = result['ins'] + self.data[token]['ins']
|
|
|
|
result['ins'] = result['ins'] + self.data[token]['ins']
|
|
|
|
result['del'] = result['del'] + self.data[token]['del']
|
|
|
|
result['del'] = result['del'] + self.data[token]['del']
|
|
|
|
return result
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def cluster(self, data):
|
|
|
|
def cluster(self, data):
|
|
|
|
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
|
|
|
|
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
|
|
|
|
for token in data:
|
|
|
|
for token in data:
|
|
|
@ -205,12 +236,15 @@ class Calculator :
|
|
|
|
result['ins'] = result['ins'] + self.data[token]['ins']
|
|
|
|
result['ins'] = result['ins'] + self.data[token]['ins']
|
|
|
|
result['del'] = result['del'] + self.data[token]['del']
|
|
|
|
result['del'] = result['del'] + self.data[token]['del']
|
|
|
|
return result
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
def keys(self):
    """Return a new list holding every key currently present in ``self.data``."""
    return list(self.data)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def width(string):
|
|
|
|
def width(string):
|
|
|
|
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
|
|
|
|
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def default_cluster(word):
|
|
|
|
def default_cluster(word):
|
|
|
|
unicode_names = [unicodedata.name(char) for char in word]
|
|
|
|
unicode_names = [unicodedata.name(char) for char in word]
|
|
|
|
for i in reversed(range(len(unicode_names))):
|
|
|
|
for i in reversed(range(len(unicode_names))):
|
|
|
@ -250,9 +284,15 @@ def default_cluster(word) :
|
|
|
|
return 'Other'
|
|
|
|
return 'Other'
|
|
|
|
return unicode_names[0]
|
|
|
|
return unicode_names[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def usage():
    """Print a one-line description of the tool and its command-line synopsis."""
    print(
        "compute-wer.py : compute word error rate (WER) and align recognition results and references."
    )
    print(
        " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if __name__ == '__main__':
|
|
|
|
if len(sys.argv) == 1:
|
|
|
|
if len(sys.argv) == 1:
|
|
|
@ -370,7 +410,8 @@ if __name__ == '__main__':
|
|
|
|
array = line.strip().split()
|
|
|
|
array = line.strip().split()
|
|
|
|
if len(array) == 0: continue
|
|
|
|
if len(array) == 0: continue
|
|
|
|
fid = array[0]
|
|
|
|
fid = array[0]
|
|
|
|
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
|
|
|
|
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
|
|
|
|
|
|
|
|
split)
|
|
|
|
|
|
|
|
|
|
|
|
# compute error rate on the intersection of reference file and hyp file
|
|
|
|
# compute error rate on the intersection of reference file and hyp file
|
|
|
|
for line in open(ref_file, 'r', encoding='utf-8'):
|
|
|
|
for line in open(ref_file, 'r', encoding='utf-8'):
|
|
|
@ -399,12 +440,14 @@ if __name__ == '__main__':
|
|
|
|
result = calculator.calculate(lab, rec)
|
|
|
|
result = calculator.calculate(lab, rec)
|
|
|
|
if verbose:
|
|
|
|
if verbose:
|
|
|
|
if result['all'] != 0:
|
|
|
|
if result['all'] != 0:
|
|
|
|
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
|
|
|
|
wer = float(result['ins'] + result['sub'] + result[
|
|
|
|
|
|
|
|
'del']) * 100.0 / result['all']
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
wer = 0.0
|
|
|
|
wer = 0.0
|
|
|
|
print('WER: %4.2f %%' % wer, end=' ')
|
|
|
|
print('WER: %4.2f %%' % wer, end=' ')
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'],
|
|
|
|
|
|
|
|
result['ins']))
|
|
|
|
space = {}
|
|
|
|
space = {}
|
|
|
|
space['lab'] = []
|
|
|
|
space['lab'] = []
|
|
|
|
space['rec'] = []
|
|
|
|
space['rec'] = []
|
|
|
@ -446,30 +489,37 @@ if __name__ == '__main__':
|
|
|
|
rec1 = rec2
|
|
|
|
rec1 = rec2
|
|
|
|
|
|
|
|
|
|
|
|
if verbose:
|
|
|
|
if verbose:
|
|
|
|
print('===========================================================================')
|
|
|
|
print(
|
|
|
|
|
|
|
|
'==========================================================================='
|
|
|
|
|
|
|
|
)
|
|
|
|
print()
|
|
|
|
print()
|
|
|
|
|
|
|
|
|
|
|
|
result = calculator.overall()
|
|
|
|
result = calculator.overall()
|
|
|
|
if result['all'] != 0:
|
|
|
|
if result['all'] != 0:
|
|
|
|
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
|
|
|
|
wer = float(result['ins'] + result['sub'] + result[
|
|
|
|
|
|
|
|
'del']) * 100.0 / result['all']
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
wer = 0.0
|
|
|
|
wer = 0.0
|
|
|
|
print('Overall -> %4.2f %%' % wer, end=' ')
|
|
|
|
print('Overall -> %4.2f %%' % wer, end=' ')
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'],
|
|
|
|
|
|
|
|
result['ins']))
|
|
|
|
if not verbose:
|
|
|
|
if not verbose:
|
|
|
|
print()
|
|
|
|
print()
|
|
|
|
|
|
|
|
|
|
|
|
if verbose:
|
|
|
|
if verbose:
|
|
|
|
for cluster_id in default_clusters:
|
|
|
|
for cluster_id in default_clusters:
|
|
|
|
result = calculator.cluster([ k for k in default_clusters[cluster_id] ])
|
|
|
|
result = calculator.cluster(
|
|
|
|
|
|
|
|
[k for k in default_clusters[cluster_id]])
|
|
|
|
if result['all'] != 0:
|
|
|
|
if result['all'] != 0:
|
|
|
|
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
|
|
|
|
wer = float(result['ins'] + result['sub'] + result[
|
|
|
|
|
|
|
|
'del']) * 100.0 / result['all']
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
wer = 0.0
|
|
|
|
wer = 0.0
|
|
|
|
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
|
|
|
|
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'],
|
|
|
|
|
|
|
|
result['ins']))
|
|
|
|
if len(cluster_file) > 0: # compute separated WERs for word clusters
|
|
|
|
if len(cluster_file) > 0: # compute separated WERs for word clusters
|
|
|
|
cluster_id = ''
|
|
|
|
cluster_id = ''
|
|
|
|
cluster = []
|
|
|
|
cluster = []
|
|
|
@ -480,12 +530,14 @@ if __name__ == '__main__':
|
|
|
|
token.lstrip('</').rstrip('>') == cluster_id :
|
|
|
|
token.lstrip('</').rstrip('>') == cluster_id :
|
|
|
|
result = calculator.cluster(cluster)
|
|
|
|
result = calculator.cluster(cluster)
|
|
|
|
if result['all'] != 0:
|
|
|
|
if result['all'] != 0:
|
|
|
|
wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all']
|
|
|
|
wer = float(result['ins'] + result['sub'] + result[
|
|
|
|
|
|
|
|
'del']) * 100.0 / result['all']
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
wer = 0.0
|
|
|
|
wer = 0.0
|
|
|
|
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
|
|
|
|
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
print('N=%d C=%d S=%d D=%d I=%d' %
|
|
|
|
(result['all'], result['cor'], result['sub'], result['del'], result['ins']))
|
|
|
|
(result['all'], result['cor'], result['sub'],
|
|
|
|
|
|
|
|
result['del'], result['ins']))
|
|
|
|
cluster_id = ''
|
|
|
|
cluster_id = ''
|
|
|
|
cluster = []
|
|
|
|
cluster = []
|
|
|
|
# begin of cluster reached, like <Keyword>
|
|
|
|
# begin of cluster reached, like <Keyword>
|
|
|
@ -497,4 +549,6 @@ if __name__ == '__main__':
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
cluster.append(token)
|
|
|
|
cluster.append(token)
|
|
|
|
print()
|
|
|
|
print()
|
|
|
|
print('===========================================================================')
|
|
|
|
print(
|
|
|
|
|
|
|
|
'==========================================================================='
|
|
|
|
|
|
|
|
)
|
|
|
|