pull/1715/head
Hui Zhang 3 years ago
parent a054d1c452
commit f399ca9d32

@ -2,7 +2,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# CopyRight WeNet Apache-2.0 License # CopyRight WeNet Apache-2.0 License
import codecs import codecs
import re
import sys import sys
import unicodedata import unicodedata
@ -33,7 +32,8 @@ def characterize(string):
else: else:
# some input looks like: <unk><noise>, we want to separate it to two words. # some input looks like: <unk><noise>, we want to separate it to two words.
sep = ' ' sep = ' '
if char == '<': sep = '>' if char == '<':
sep = '>'
j = i + 1 j = i + 1
while j < len(string): while j < len(string):
c = string[j] c = string[j]
@ -48,7 +48,8 @@ def characterize(string):
def stripoff_tags(x): def stripoff_tags(x):
if not x: return '' if not x:
return ''
chars = [] chars = []
i = 0 i = 0
T = len(x) T = len(x)
@ -365,7 +366,7 @@ if __name__ == '__main__':
verbose = 0 verbose = 0
try: try:
verbose = int(b) verbose = int(b)
except: except Exception as e:
if b == 'true' or b != '0': if b == 'true' or b != '0':
verbose = 1 verbose = 1
continue continue
@ -408,7 +409,8 @@ if __name__ == '__main__':
array = characterize(line) array = characterize(line)
else: else:
array = line.strip().split() array = line.strip().split()
if len(array) == 0: continue if len(array) == 0:
continue
fid = array[0] fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
split) split)
@ -419,7 +421,8 @@ if __name__ == '__main__':
array = characterize(line) array = characterize(line)
else: else:
array = line.rstrip('\n').split() array = line.rstrip('\n').split()
if len(array) == 0: continue if len(array) == 0:
continue
fid = array[0] fid = array[0]
if fid not in rec_set: if fid not in rec_set:
continue continue

Loading…
Cancel
Save