|
|
@ -2,7 +2,6 @@
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# CopyRight WeNet Apache-2.0 License
|
|
|
|
# CopyRight WeNet Apache-2.0 License
|
|
|
|
import codecs
|
|
|
|
import codecs
|
|
|
|
import re
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import sys
|
|
|
|
import unicodedata
|
|
|
|
import unicodedata
|
|
|
|
|
|
|
|
|
|
|
@ -33,7 +32,8 @@ def characterize(string):
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
# some input looks like: <unk><noise>, we want to separate it to two words.
|
|
|
|
# some input looks like: <unk><noise>, we want to separate it to two words.
|
|
|
|
sep = ' '
|
|
|
|
sep = ' '
|
|
|
|
if char == '<': sep = '>'
|
|
|
|
if char == '<':
|
|
|
|
|
|
|
|
sep = '>'
|
|
|
|
j = i + 1
|
|
|
|
j = i + 1
|
|
|
|
while j < len(string):
|
|
|
|
while j < len(string):
|
|
|
|
c = string[j]
|
|
|
|
c = string[j]
|
|
|
@ -48,7 +48,8 @@ def characterize(string):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def stripoff_tags(x):
|
|
|
|
def stripoff_tags(x):
|
|
|
|
if not x: return ''
|
|
|
|
if not x:
|
|
|
|
|
|
|
|
return ''
|
|
|
|
chars = []
|
|
|
|
chars = []
|
|
|
|
i = 0
|
|
|
|
i = 0
|
|
|
|
T = len(x)
|
|
|
|
T = len(x)
|
|
|
@ -365,7 +366,7 @@ if __name__ == '__main__':
|
|
|
|
verbose = 0
|
|
|
|
verbose = 0
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
verbose = int(b)
|
|
|
|
verbose = int(b)
|
|
|
|
except:
|
|
|
|
except Exception as e:
|
|
|
|
if b == 'true' or b != '0':
|
|
|
|
if b == 'true' or b != '0':
|
|
|
|
verbose = 1
|
|
|
|
verbose = 1
|
|
|
|
continue
|
|
|
|
continue
|
|
|
@ -408,7 +409,8 @@ if __name__ == '__main__':
|
|
|
|
array = characterize(line)
|
|
|
|
array = characterize(line)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
array = line.strip().split()
|
|
|
|
array = line.strip().split()
|
|
|
|
if len(array) == 0: continue
|
|
|
|
if len(array) == 0:
|
|
|
|
|
|
|
|
continue
|
|
|
|
fid = array[0]
|
|
|
|
fid = array[0]
|
|
|
|
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
|
|
|
|
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
|
|
|
|
split)
|
|
|
|
split)
|
|
|
@ -419,7 +421,8 @@ if __name__ == '__main__':
|
|
|
|
array = characterize(line)
|
|
|
|
array = characterize(line)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
array = line.rstrip('\n').split()
|
|
|
|
array = line.rstrip('\n').split()
|
|
|
|
if len(array) == 0: continue
|
|
|
|
if len(array) == 0:
|
|
|
|
|
|
|
|
continue
|
|
|
|
fid = array[0]
|
|
|
|
fid = array[0]
|
|
|
|
if fid not in rec_set:
|
|
|
|
if fid not in rec_set:
|
|
|
|
continue
|
|
|
|
continue
|
|
|
|