[s2t] move s2t data preprocess into paddlespeech.dataset (#3189)

* move s2t data preprocess into paddlespeech.dataset

* avg model, compute wer, format rsl into paddlespeech.dataset

* fix format rsl

* fix avg ckpts
Hui Zhang committed via GitHub
parent 8c7859d3bc
commit df3be4acae

@ -1,15 +1,21 @@
#!/bin/bash
-if [ $# != 3 ];then
-    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
-    exit -1
-fi
+set -e
stage=0
stop_stage=100
+source utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
+if [ $# != 3 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
+    exit -1
+fi
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    echo "using sclite to compute cer..."
    # format the reference test file for sclite
    python utils/format_rsl.py \
        --origin_ref data/manifest.test.raw \

@ -28,6 +28,7 @@ import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
+from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@ -139,7 +140,7 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
def main():
-    print(f"args: {args}")
+    print_arguments(args, globals())
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

@ -28,6 +28,7 @@ import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
+from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
@ -205,7 +206,7 @@ def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
def main():
-    print(f"args: {args}")
+    print_arguments(args, globals())
    if args.target_dir.startswith('~'):
        args.target_dir = os.path.expanduser(args.target_dir)

@ -0,0 +1,20 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# s2t utils binaries.
from .avg_model import main as avg_ckpts_main
from .build_vocab import main as build_vocab_main
from .compute_mean_std import main as compute_mean_std_main
from .compute_wer import main as compute_wer_main
from .format_data import main as format_data_main
from .format_rsl import main as format_rsl_main
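The re-exported entry points above let the old `utils/` scripts shrink to thin shims, as the rewritten avg_model.py, build_vocab.py and compute_wer.py later in this diff show. A minimal sketch of such a shim (not part of the diff; the file path is hypothetical):

    # hypothetical utils/compute_wer.py shim delegating to the relocated implementation
    from paddlespeech.dataset.s2t import compute_wer_main

    if __name__ == '__main__':
        compute_wer_main()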

@ -0,0 +1,125 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def define_argparse():
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='averaged model')
parser.add_argument(
'--num', default=5, type=int, help='nums for averaged model')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
return args
def average_checkpoints(dst_model="",
ckpt_dir="",
val_best=True,
num=5,
min_epoch=0,
max_epoch=65536):
paddle.set_device('cpu')
val_scores = []
jsons = glob.glob(f'{ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= min_epoch and epoch <= max_epoch:
val_scores.append((epoch, loss))
assert val_scores, f"Did not find any valid checkpoints: {val_scores}"
val_scores = np.array(val_scores)
if val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
beat_val_scores = sorted_val_scores[:num, 1]
selected_epochs = sorted_val_scores[:num, 0].astype(np.int64)
avg_val_score = np.mean(beat_val_scores)
print("selected val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:num, 0]
]
print(path_list)
avg = None
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, dst_model)
print(f'Saving to {dst_model}')
meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if val_best else 'latest',
"avg_ckpt": dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
def main():
args = define_argparse()
average_checkpoints(**vars(args))
if __name__ == '__main__':
main()
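A usage sketch for the relocated averaging entry point (not part of the diff); the checkpoint directory and output path are placeholders:

    # average the 5 best checkpoints (by val_loss) found in a checkpoint directory
    from paddlespeech.dataset.s2t.avg_model import average_checkpoints

    average_checkpoints(
        dst_model='exp/conformer/checkpoints/avg_5.pdparams',  # placeholder output
        ckpt_dir='exp/conformer/checkpoints',                  # placeholder ckpt dir
        val_best=True,
        num=5)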

@ -0,0 +1,166 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
tokens = text_feature.tokenize(
line_json['text'], replace_space=False)
counter.update(tokens)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
tokens = text_feature.tokenize(text, replace_space=False)
counter.update(tokens)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def build_vocab(manifest_paths="",
vocab_path="examples/librispeech/data/vocab.txt",
unit_type="char",
count_threshold=0,
text_keys='text',
spm_mode="unigram",
spm_vocab_size=0,
spm_model_prefix="",
spm_character_coverage=0.9995):
fout = open(vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in manifest_paths:
_text_keys = [text_keys] if type(
text_keys) is not list else text_keys
for text_key in _text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=spm_vocab_size,
model_type=spm_mode,
model_prefix=spm_model_prefix,
input_sentence_size=100000000,
character_coverage=spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(unit_type, "", spm_model_prefix)
counter = Counter()
for manifest_path in manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
build_vocab(**vars(args))
if __name__ == '__main__':
main()
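A sketch of calling the relocated vocabulary builder directly (not part of the diff); the manifest and output paths are placeholders:

    # build a character-level vocabulary from one manifest
    from paddlespeech.dataset.s2t.build_vocab import build_vocab

    build_vocab(
        manifest_paths=['data/manifest.train'],   # placeholder manifest
        vocab_path='data/lang_char/vocab.txt',    # placeholder output
        unit_type='char',
        count_threshold=0,
        text_keys='text')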

@ -0,0 +1,106 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def compute_cmvn(manifest_path="data/librispeech/manifest.train",
output_path="data/librispeech/mean_std.npz",
num_samples=2000,
num_workers=0,
spectrum_type="linear",
feat_dim=13,
delta_delta=False,
stride_ms=10,
window_ms=20,
sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=float(stride_ms),
window_ms=float(window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=manifest_path,
featurize_func=augment_and_featurize,
num_samples=num_samples,
num_workers=num_workers)
normalizer.write_to_file(output_path)
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
add_arg('num_samples', int, 2000, "# of samples used for statistics.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
compute_cmvn(**vars(args))
if __name__ == '__main__':
main()
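A sketch of the relocated CMVN computation (not part of the diff); paths are placeholders and the fbank settings are only an example:

    # compute mean/std statistics over 2,000 sampled utterances
    from paddlespeech.dataset.s2t.compute_mean_std import compute_cmvn

    compute_cmvn(
        manifest_path='data/manifest.train',   # placeholder manifest
        output_path='data/mean_std.npz',       # placeholder output
        num_samples=2000,
        spectrum_type='fbank',
        feat_dim=80)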

@ -0,0 +1,558 @@
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# flake8: noqa
import codecs
import re
import sys
import unicodedata
remove_tag = True
spacelist = [' ', '\t', '\r', '\n']
puncts = [
'!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
'《', '》'
]
def characterize(string):
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
#https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == 'Lo': # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it to two words.
sep = ' '
if char == '<': sep = '>'
j = i + 1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c == sep):
break
j += 1
if j < len(string) and string[j] == '>':
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x: return ''
chars = []
i = 0
T = len(x)
while i < T:
if x[i] == '<':
while i < T and x[i] != '>':
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return ''.join(chars)
def normalize(sentence, ignore_words, cs, split=None):
""" sentence, ignore_words are both in unicode
"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator:
def __init__(self):
self.data = {}
self.space = []
self.cost = {}
self.cost['cor'] = 0
self.cost['sub'] = 1
self.cost['del'] = 1
self.cost['ins'] = 1
def calculate(self, lab, rec):
# Initialization
lab.insert(0, '')
rec.insert(0, '')
while len(self.space) < len(lab):
self.space.append([])
for row in self.space:
for element in row:
element['dist'] = 0
element['error'] = 'non'
while len(row) < len(rec):
row.append({'dist': 0, 'error': 'non'})
for i in range(len(lab)):
self.space[i][0]['dist'] = i
self.space[i][0]['error'] = 'del'
for j in range(len(rec)):
self.space[0][j]['dist'] = j
self.space[0][j]['error'] = 'ins'
self.space[0][0]['error'] = 'non'
for token in lab:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
for token in rec:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
# Computing edit distance
for i, lab_token in enumerate(lab):
for j, rec_token in enumerate(rec):
if i == 0 or j == 0:
continue
min_dist = sys.maxsize
min_error = 'none'
dist = self.space[i - 1][j]['dist'] + self.cost['del']
error = 'del'
if dist < min_dist:
min_dist = dist
min_error = error
dist = self.space[i][j - 1]['dist'] + self.cost['ins']
error = 'ins'
if dist < min_dist:
min_dist = dist
min_error = error
if lab_token == rec_token:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
error = 'cor'
else:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
error = 'sub'
if dist < min_dist:
min_dist = dist
min_error = error
self.space[i][j]['dist'] = min_dist
self.space[i][j]['error'] = min_error
# Tracing back
result = {
'lab': [],
'rec': [],
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
i = len(lab) - 1
j = len(rec) - 1
while True:
if self.space[i][j]['error'] == 'cor': # correct
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
result['all'] = result['all'] + 1
result['cor'] = result['cor'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'sub': # substitution
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
result['all'] = result['all'] + 1
result['sub'] = result['sub'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'del': # deletion
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
result['all'] = result['all'] + 1
result['del'] = result['del'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, "")
i = i - 1
elif self.space[i][j]['error'] == 'ins': # insertion
if len(rec[j]) > 0:
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
result['ins'] = result['ins'] + 1
result['lab'].insert(0, "")
result['rec'].insert(0, rec[j])
j = j - 1
elif self.space[i][j]['error'] == 'non': # starting point
break
else: # shouldn't reach here
print(
'this should not happen , i = {i} , j = {j} , error = {error}'.
format(i=i, j=j, error=self.space[i][j]['error']))
return result
def overall(self):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def cluster(self, data):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in data:
if token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def keys(self):
return list(self.data.keys())
def width(string):
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word):
unicode_names = [unicodedata.name(char) for char in word]
for i in reversed(range(len(unicode_names))):
if unicode_names[i].startswith('DIGIT'): # 1
unicode_names[i] = 'Number' # 'DIGIT'
elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
# 明 / 郎
unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
unicode_names[i].startswith('LATIN SMALL LETTER')):
# A / a
unicode_names[i] = 'English' # 'LATIN LETTER'
elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
unicode_names[i] = 'Japanese' # 'GANA LETTER'
elif (unicode_names[i].startswith('AMPERSAND') or
unicode_names[i].startswith('APOSTROPHE') or
unicode_names[i].startswith('COMMERCIAL AT') or
unicode_names[i].startswith('DEGREE CELSIUS') or
unicode_names[i].startswith('EQUALS SIGN') or
unicode_names[i].startswith('FULL STOP') or
unicode_names[i].startswith('HYPHEN-MINUS') or
unicode_names[i].startswith('LOW LINE') or
unicode_names[i].startswith('NUMBER SIGN') or
unicode_names[i].startswith('PLUS SIGN') or
unicode_names[i].startswith('SEMICOLON')):
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else:
return 'Other'
if len(unicode_names) == 0:
return 'Other'
if len(unicode_names) == 1:
return unicode_names[0]
for i in range(len(unicode_names) - 1):
if unicode_names[i] != unicode_names[i + 1]:
return 'Other'
return unicode_names[0]
def usage():
print(
"compute-wer.py : compute word error rate (WER) and align recognition results and references."
)
print(
" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
)
def main():
# python utils/compute-wer.py --char=1 --v=1 ref hyp > rsl.error
global remove_tag  # the '--rt=' flag below must update the module-level flag read by normalize()
if len(sys.argv) == 1:
usage()
sys.exit(0)
calculator = Calculator()
cluster_file = ''
ignore_words = set()
tochar = False
verbose = 1
padding_symbol = ' '
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
while len(sys.argv) > 3:
a = '--maxw='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):]
del sys.argv[1]
max_words_per_line = int(b)
continue
a = '--rt='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
remove_tag = (b == 'true') or (b != '0')
continue
a = '--cs='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
case_sensitive = (b == 'true') or (b != '0')
continue
a = '--cluster='
if sys.argv[1].startswith(a):
cluster_file = sys.argv[1][len(a):]
del sys.argv[1]
continue
a = '--splitfile='
if sys.argv[1].startswith(a):
split_file = sys.argv[1][len(a):]
del sys.argv[1]
split = dict()
with codecs.open(split_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
words = line.strip().split()
if len(words) >= 2:
split[words[0]] = words[1:]
continue
a = '--ig='
if sys.argv[1].startswith(a):
ignore_file = sys.argv[1][len(a):]
del sys.argv[1]
with codecs.open(ignore_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
line = line.strip()
if len(line) > 0:
ignore_words.add(line)
continue
a = '--char='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
tochar = (b == 'true') or (b != '0')
continue
a = '--v='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
verbose = 0
try:
verbose = int(b)
except:
if b == 'true' or b != '0':
verbose = 1
continue
a = '--padding-symbol='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
if b == 'space':
padding_symbol = ' '
elif b == 'underline':
padding_symbol = '_'
continue
if True or sys.argv[1].startswith('-'):
#ignore invalid switch
del sys.argv[1]
continue
if not case_sensitive:
ig = set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
rec_set = {}
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
with codecs.open(hyp_file, 'r', 'utf-8') as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0: continue
fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
split)
# compute error rate on the interaction of reference file and hyp file
for line in open(ref_file, 'r', encoding='utf-8'):
if tochar:
array = characterize(line)
else:
array = line.rstrip('\n').split()
if len(array) == 0: continue
fid = array[0]
if fid not in rec_set:
continue
lab = normalize(array[1:], ignore_words, case_sensitive, split)
rec = rec_set[fid]
if verbose:
print('\nutt: %s' % fid)
for word in rec + lab:
if word not in default_words:
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters:
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name]:
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculator.calculate(lab, rec)
if verbose:
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('WER: %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
space = {}
space['lab'] = []
space['rec'] = []
for idx in range(len(result['lab'])):
len_lab = width(result['lab'][idx])
len_rec = width(result['rec'][idx])
length = max(len_lab, len_rec)
space['lab'].append(length - len_lab)
space['rec'].append(length - len_rec)
upper_lab = len(result['lab'])
upper_rec = len(result['rec'])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print('lab(%s):' % fid.encode('utf-8'), end=' ')
else:
print('lab:', end=' ')
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result['lab'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['lab'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print()
if verbose > 1:
print('rec(%s):' % fid.encode('utf-8'), end=' ')
else:
print('rec:', end=' ')
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result['rec'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['rec'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print('\n', end='\n')
lab1 = lab2
rec1 = rec2
if verbose:
print(
'==========================================================================='
)
print()
result = calculator.overall()
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('Overall -> %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if not verbose:
print()
if verbose:
for cluster_id in default_clusters:
result = calculator.cluster(
[k for k in default_clusters[cluster_id]])
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if len(cluster_file) > 0: # compute separated WERs for word clusters
cluster_id = ''
cluster = []
for line in open(cluster_file, 'r', encoding='utf-8'):
for token in line.rstrip('\n').split():
# end of cluster reached, like </Keyword>
if token[0:2] == '</' and token[len(token)-1] == '>' and \
token.lstrip('</').rstrip('>') == cluster_id :
result = calculator.cluster(cluster)
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'],
result['del'], result['ins']))
cluster_id = ''
cluster = []
# begin of cluster reached, like <Keyword>
elif token[0] == '<' and token[len(token)-1] == '>' and \
cluster_id == '' :
cluster_id = token.lstrip('<').rstrip('>')
cluster = []
# general terms, like WEATHER / CAR / ...
else:
cluster.append(token)
print()
print(
'==========================================================================='
)
if __name__ == '__main__':
main()
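Besides the CLI shown in usage(), the Calculator class can be driven directly from Python; a small sketch with made-up token lists (not part of the diff):

    # score one hypothesis against one reference at the token level
    from paddlespeech.dataset.s2t.compute_wer import Calculator

    calculator = Calculator()
    ref = ['今', '天', '天', '气']   # made-up reference tokens
    hyp = ['今', '天', '气']         # made-up hypothesis tokens
    calculator.calculate(ref, hyp)
    overall = calculator.overall()
    wer = 100.0 * (overall['ins'] + overall['sub'] + overall['del']) / overall['all']
    print('WER: %.2f%%  N=%d' % (wer, overall['all']))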

@ -0,0 +1,154 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
# yapf: disable
args = parser.parse_args()
return args
def format_data(
manifest_paths="",
output_path="",
cmvn_path="examples/librispeech/data/mean_std.json",
unit_type="char",
vocab_path="examples/librispeech/data/vocab.txt",
spm_model_prefix=""):
fout = open(output_path, 'w', encoding='utf-8')
# get feat dim
filetype = cmvn_path.split(".")[-1]
mean, istd = load_cmvn(cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{manifest_paths} Examples number: {count}")
fout.close()
def main():
args = define_argparse()
print_arguments(args, globals())
format_data(**vars(args))
if __name__ == '__main__':
main()
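A sketch of the relocated manifest formatter (not part of the diff); all paths are placeholders:

    # convert a raw manifest into the jsonline format consumed by training
    from paddlespeech.dataset.s2t.format_data import format_data

    format_data(
        manifest_paths=['data/manifest.train.raw'],   # placeholder input
        output_path='data/manifest.train',            # placeholder output
        cmvn_path='data/mean_std.json',
        unit_type='char',
        vocab_path='data/lang_char/vocab.txt',
        spm_model_prefix='')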

@ -0,0 +1,143 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
format ref/hyp file for `utt text` format to compute CER/WER/MER.
norm:
BAC009S0764W0196 明确了发展目标和重点任务
BAC009S0764W0186 实现我国房地产市场的平稳运行
sclite:
加大对结构机械化环境和收集谈控机制力度(BAC009S0906W0240.wav)
河南省新乡市丰秋县刘光镇政府东五零左右(BAC009S0770W0441.wav)
"""
import argparse
import jsonlines
from paddlespeech.utils.argparse import print_arguments
def transform_hyp(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the model output
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans:
with open(trans, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def transform_ref(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the reference text
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans:
with open(trans, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def define_argparse():
parser = argparse.ArgumentParser(
prog='format ref/hyp file for compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default="", help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default="",
help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default="",
help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default="", help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default="",
help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default="",
help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
return parser_args
def format_result(origin_hyp="",
trans_hyp="",
trans_hyp_sclite="",
origin_ref="",
trans_ref="",
trans_ref_sclite=""):
if origin_hyp:
transform_hyp(
origin=origin_hyp, trans=trans_hyp, trans_sclite=trans_hyp_sclite)
if origin_ref:
transform_ref(
origin=origin_ref, trans=trans_ref, trans_sclite=trans_ref_sclite)
def main():
args = define_argparse()
print_arguments(args, globals())
format_result(**vars(args))
if __name__ == "__main__":
main()
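The recipe change at the top of this diff drives this through utils/format_rsl.py; an equivalent direct call (not part of the diff), with placeholder paths:

    # produce plain `utt text` and sclite-style `text(utt.wav)` reference files
    from paddlespeech.dataset.s2t.format_rsl import format_result

    format_result(
        origin_ref='data/manifest.test.raw',                # placeholder manifest
        trans_ref='data/manifest.test.text',                # for compute_wer
        trans_ref_sclite='data/manifest.test.text.sclite')  # for sclite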

@ -28,8 +28,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
def init_predictor(args):

@ -26,8 +26,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
def start_server(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -27,8 +27,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
-from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.utils.argparse import print_arguments
logger = Log(__name__).getlog()

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
model_test_alias = {
    "u2": "paddlespeech.s2t.exps.u2.model:U2Tester",

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
model_train_alias = {
    "u2": "paddlespeech.s2t.exps.u2.model:U2Trainer",

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -48,13 +48,16 @@ class TextFeaturizer():
        self.unit_type = unit_type
        self.unk = UNK
        self.maskctc = maskctc
+        self.vocab_path_or_list = vocab
-        if vocab:
+        if self.vocab_path_or_list:
            self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
                vocab, maskctc)
            self.vocab_size = len(self.vocab_list)
        else:
-            logger.warning("TextFeaturizer: not have vocab file or vocab list.")
+            logger.warning(
+                "TextFeaturizer: not have vocab file or vocab list. Only Tokenizer can use, can not convert to token idx"
+            )
        if unit_type == 'spm':
            spm_model = spm_model_prefix + '.model'
@ -62,6 +65,7 @@ class TextFeaturizer():
            self.sp.Load(spm_model)
    def tokenize(self, text, replace_space=True):
+        """tokenizer split text into text tokens"""
        if self.unit_type == 'char':
            tokens = self.char_tokenize(text, replace_space)
        elif self.unit_type == 'word':
@ -71,6 +75,7 @@ class TextFeaturizer():
        return tokens
    def detokenize(self, tokens):
+        """tokenizer convert text tokens back to text"""
        if self.unit_type == 'char':
            text = self.char_detokenize(tokens)
        elif self.unit_type == 'word':
@ -88,6 +93,7 @@ class TextFeaturizer():
        Returns:
            List[int]: List of token indices.
        """
+        assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
        tokens = self.tokenize(text)
        ids = []
        for token in tokens:
@ -107,6 +113,7 @@ class TextFeaturizer():
        Returns:
            str: Text.
        """
+        assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
        tokens = []
        for idx in idxs:
            if idx == self.eos_id:
@ -127,10 +134,10 @@ class TextFeaturizer():
""" """
text = text.strip() text = text.strip()
if replace_space: if replace_space:
text_list = [SPACE if item == " " else item for item in list(text)] tokens = [SPACE if item == " " else item for item in list(text)]
else: else:
text_list = list(text) tokens = list(text)
return text_list return tokens
def char_detokenize(self, tokens): def char_detokenize(self, tokens):
"""Character detokenizer. """Character detokenizer.

@ -29,10 +29,7 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = [
-    "all_version", "UpdateConfig", "seed_all", 'print_arguments',
-    'add_arguments', "log_add"
-]
+__all__ = ["all_version", "UpdateConfig", "seed_all", "log_add"]
def all_version():
@ -60,51 +57,6 @@ def seed_all(seed: int=20210329):
paddle.seed(seed)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def log_add(args: List[int]) -> float:
    """Stable log add

@ -16,6 +16,8 @@ import os
import sys
from typing import Text
+import distutils
__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]

@ -12,105 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
+from paddlespeech.dataset.s2t import avg_ckpts_main
import glob
import json
import os
import numpy as np
import paddle
def main(args):
paddle.set_device('cpu')
val_scores = []
beat_val_scores = None
selected_epochs = None
jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= args.min_epoch and epoch <= args.max_epoch:
val_scores.append((epoch, loss))
val_scores = np.array(val_scores)
if args.val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
beat_val_scores = sorted_val_scores[:args.num, 1]
selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
avg_val_score = np.mean(beat_val_scores)
print("selected val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:args.num, 0]
]
print(path_list)
avg = None
num = args.num
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, args.dst_model)
print(f'Saving to {args.dst_model}')
meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='average model')
+    avg_ckpts_main()
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='averaged model')
parser.add_argument(
'--num', default=5, type=int, help='nums for averaged model')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
print(args)
main(args)

@ -15,134 +15,7 @@
"""Build vocabulary from manifest files. """Build vocabulary from manifest files.
Each item in vocabulary file is a character. Each item in vocabulary file is a character.
""" """
import argparse from paddlespeech.dataset.s2t import build_vocab_main
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
line = text_feature.tokenize(line_json['text'], replace_space=False)
counter.update(line)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
line = text_feature.tokenize(text, replace_space=False)
counter.update(line)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def main():
print_arguments(args, globals())
fout = open(args.vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if args.unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in args.manifest_paths:
text_keys = [args.text_keys] if type(args.text_keys) is not list else args.text_keys
for text_key in text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=args.spm_vocab_size,
model_type=args.spm_mode,
model_prefix=args.spm_model_prefix,
input_sentence_size=100000000,
character_coverage=args.spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(args.unit_type, "", args.spm_model_prefix)
counter = Counter()
for manifest_path in args.manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < args.count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
if __name__ == '__main__':
    main()
    build_vocab_main()
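
The character-counting core of build_vocab reduces to a Counter over manifest text; a hedged sketch (the manifest path and the plain 'text' key are assumptions):

# Sketch of the token counting used for char vocabularies above; path is illustrative.
from collections import Counter

import jsonlines

counter = Counter()
with jsonlines.open('data/manifest.train.raw', 'r') as reader:
    for record in reader:
        # map the literal space to a <space> token, as the script does
        counter.update('<space>' if ch == ' ' else ch for ch in record['text'])
tokens = sorted(tok for tok, cnt in counter.items())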

@ -1,554 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
from paddlespeech.dataset.s2t import compute_wer_main
import codecs
import re
import sys
import unicodedata
remove_tag = True
spacelist = [' ', '\t', '\r', '\n']
puncts = [
'!', ',', '?', '', '', '', '', '', '', '', '', '', '', '', '',
'', ''
]
def characterize(string):
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
#https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == 'Lo': # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it to two words.
sep = ' '
if char == '<': sep = '>'
j = i + 1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c == sep):
break
j += 1
if j < len(string) and string[j] == '>':
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x: return ''
chars = []
i = 0
T = len(x)
while i < T:
if x[i] == '<':
while i < T and x[i] != '>':
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return ''.join(chars)
def normalize(sentence, ignore_words, cs, split=None):
""" sentence, ignore_words are both in unicode
"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator:
def __init__(self):
self.data = {}
self.space = []
self.cost = {}
self.cost['cor'] = 0
self.cost['sub'] = 1
self.cost['del'] = 1
self.cost['ins'] = 1
def calculate(self, lab, rec):
# Initialization
lab.insert(0, '')
rec.insert(0, '')
while len(self.space) < len(lab):
self.space.append([])
for row in self.space:
for element in row:
element['dist'] = 0
element['error'] = 'non'
while len(row) < len(rec):
row.append({'dist': 0, 'error': 'non'})
for i in range(len(lab)):
self.space[i][0]['dist'] = i
self.space[i][0]['error'] = 'del'
for j in range(len(rec)):
self.space[0][j]['dist'] = j
self.space[0][j]['error'] = 'ins'
self.space[0][0]['error'] = 'non'
for token in lab:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
for token in rec:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
# Computing edit distance
for i, lab_token in enumerate(lab):
for j, rec_token in enumerate(rec):
if i == 0 or j == 0:
continue
min_dist = sys.maxsize
min_error = 'none'
dist = self.space[i - 1][j]['dist'] + self.cost['del']
error = 'del'
if dist < min_dist:
min_dist = dist
min_error = error
dist = self.space[i][j - 1]['dist'] + self.cost['ins']
error = 'ins'
if dist < min_dist:
min_dist = dist
min_error = error
if lab_token == rec_token:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
error = 'cor'
else:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
error = 'sub'
if dist < min_dist:
min_dist = dist
min_error = error
self.space[i][j]['dist'] = min_dist
self.space[i][j]['error'] = min_error
# Tracing back
result = {
'lab': [],
'rec': [],
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
i = len(lab) - 1
j = len(rec) - 1
while True:
if self.space[i][j]['error'] == 'cor': # correct
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
result['all'] = result['all'] + 1
result['cor'] = result['cor'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'sub': # substitution
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
result['all'] = result['all'] + 1
result['sub'] = result['sub'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'del': # deletion
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
result['all'] = result['all'] + 1
result['del'] = result['del'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, "")
i = i - 1
elif self.space[i][j]['error'] == 'ins': # insertion
if len(rec[j]) > 0:
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
result['ins'] = result['ins'] + 1
result['lab'].insert(0, "")
result['rec'].insert(0, rec[j])
j = j - 1
elif self.space[i][j]['error'] == 'non': # starting point
break
else: # shouldn't reach here
print(
'this should not happen , i = {i} , j = {j} , error = {error}'.
format(i=i, j=j, error=self.space[i][j]['error']))
return result
def overall(self):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def cluster(self, data):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in data:
if token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def keys(self):
return list(self.data.keys())
def width(string):
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word):
unicode_names = [unicodedata.name(char) for char in word]
for i in reversed(range(len(unicode_names))):
if unicode_names[i].startswith('DIGIT'): # 1
unicode_names[i] = 'Number' # 'DIGIT'
elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
# 明 / 郎
unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
unicode_names[i].startswith('LATIN SMALL LETTER')):
# A / a
unicode_names[i] = 'English' # 'LATIN LETTER'
elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
unicode_names[i] = 'Japanese' # 'GANA LETTER'
elif (unicode_names[i].startswith('AMPERSAND') or
unicode_names[i].startswith('APOSTROPHE') or
unicode_names[i].startswith('COMMERCIAL AT') or
unicode_names[i].startswith('DEGREE CELSIUS') or
unicode_names[i].startswith('EQUALS SIGN') or
unicode_names[i].startswith('FULL STOP') or
unicode_names[i].startswith('HYPHEN-MINUS') or
unicode_names[i].startswith('LOW LINE') or
unicode_names[i].startswith('NUMBER SIGN') or
unicode_names[i].startswith('PLUS SIGN') or
unicode_names[i].startswith('SEMICOLON')):
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else:
return 'Other'
if len(unicode_names) == 0:
return 'Other'
if len(unicode_names) == 1:
return unicode_names[0]
for i in range(len(unicode_names) - 1):
if unicode_names[i] != unicode_names[i + 1]:
return 'Other'
return unicode_names[0]
def usage():
print(
"compute-wer.py : compute word error rate (WER) and align recognition results and references."
)
print(
" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
)
if __name__ == '__main__':
    compute_wer_main()
    if len(sys.argv) == 1:
usage()
sys.exit(0)
calculator = Calculator()
cluster_file = ''
ignore_words = set()
tochar = False
verbose = 1
padding_symbol = ' '
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
while len(sys.argv) > 3:
a = '--maxw='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):]
del sys.argv[1]
max_words_per_line = int(b)
continue
a = '--rt='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
remove_tag = (b == 'true') or (b != '0')
continue
a = '--cs='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
case_sensitive = (b == 'true') or (b != '0')
continue
a = '--cluster='
if sys.argv[1].startswith(a):
cluster_file = sys.argv[1][len(a):]
del sys.argv[1]
continue
a = '--splitfile='
if sys.argv[1].startswith(a):
split_file = sys.argv[1][len(a):]
del sys.argv[1]
split = dict()
with codecs.open(split_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
words = line.strip().split()
if len(words) >= 2:
split[words[0]] = words[1:]
continue
a = '--ig='
if sys.argv[1].startswith(a):
ignore_file = sys.argv[1][len(a):]
del sys.argv[1]
with codecs.open(ignore_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
line = line.strip()
if len(line) > 0:
ignore_words.add(line)
continue
a = '--char='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
tochar = (b == 'true') or (b != '0')
continue
a = '--v='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
verbose = 0
try:
verbose = int(b)
except:
if b == 'true' or b != '0':
verbose = 1
continue
a = '--padding-symbol='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
if b == 'space':
padding_symbol = ' '
elif b == 'underline':
padding_symbol = '_'
continue
if True or sys.argv[1].startswith('-'):
#ignore invalid switch
del sys.argv[1]
continue
if not case_sensitive:
ig = set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
rec_set = {}
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
with codecs.open(hyp_file, 'r', 'utf-8') as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0: continue
fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
split)
# compute error rate on the interaction of reference file and hyp file
for line in open(ref_file, 'r', encoding='utf-8'):
if tochar:
array = characterize(line)
else:
array = line.rstrip('\n').split()
if len(array) == 0: continue
fid = array[0]
if fid not in rec_set:
continue
lab = normalize(array[1:], ignore_words, case_sensitive, split)
rec = rec_set[fid]
if verbose:
print('\nutt: %s' % fid)
for word in rec + lab:
if word not in default_words:
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters:
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name]:
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculator.calculate(lab, rec)
if verbose:
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('WER: %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
space = {}
space['lab'] = []
space['rec'] = []
for idx in range(len(result['lab'])):
len_lab = width(result['lab'][idx])
len_rec = width(result['rec'][idx])
length = max(len_lab, len_rec)
space['lab'].append(length - len_lab)
space['rec'].append(length - len_rec)
upper_lab = len(result['lab'])
upper_rec = len(result['rec'])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print('lab(%s):' % fid.encode('utf-8'), end=' ')
else:
print('lab:', end=' ')
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result['lab'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['lab'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print()
if verbose > 1:
print('rec(%s):' % fid.encode('utf-8'), end=' ')
else:
print('rec:', end=' ')
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result['rec'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['rec'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print('\n', end='\n')
lab1 = lab2
rec1 = rec2
if verbose:
print(
'==========================================================================='
)
print()
result = calculator.overall()
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('Overall -> %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if not verbose:
print()
if verbose:
for cluster_id in default_clusters:
result = calculator.cluster(
[k for k in default_clusters[cluster_id]])
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if len(cluster_file) > 0: # compute separated WERs for word clusters
cluster_id = ''
cluster = []
for line in open(cluster_file, 'r', encoding='utf-8'):
            for token in line.rstrip('\n').split():
# end of cluster reached, like </Keyword>
if token[0:2] == '</' and token[len(token)-1] == '>' and \
token.lstrip('</').rstrip('>') == cluster_id :
result = calculator.cluster(cluster)
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'],
result['del'], result['ins']))
cluster_id = ''
cluster = []
# begin of cluster reached, like <Keyword>
elif token[0] == '<' and token[len(token)-1] == '>' and \
cluster_id == '' :
cluster_id = token.lstrip('<').rstrip('>')
cluster = []
# general terms, like WEATHER / CAR / ...
else:
cluster.append(token)
print()
print(
'==========================================================================='
)
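
The percentages printed above follow the standard definition WER = (S + D + I) / N over the alignment counts; restated compactly:

# Restatement of the error-rate formula applied to the Calculator counts above.
def wer_percent(counts):
    # counts: dict with 'all', 'ins', 'sub', 'del' keys as produced by the Calculator
    if counts['all'] == 0:
        return 0.0
    return 100.0 * (counts['ins'] + counts['sub'] + counts['del']) / counts['all']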

@ -13,75 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
from paddlespeech.dataset.s2t import compute_mean_std_main
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples for statistics.")
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
use_dB_normalization=args.use_dB_normalization,
target_dB=args.target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=args.manifest_path,
featurize_func=augment_and_featurize,
num_samples=args.num_samples,
num_workers=args.num_workers)
normalizer.write_to_file(args.output_path)
if __name__ == '__main__':
    main()
    compute_mean_std_main()
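
The normalizer statistics written above boil down to a per-dimension mean and standard deviation over features sampled from the manifest; a simplified sketch (sampling and feature extraction omitted):

# Simplified sketch of the statistics computed above; the real script samples
# `num_samples` utterances and featurizes them with AudioFeaturizer first.
import numpy as np

def compute_mean_std(feature_arrays):
    # feature_arrays: list of (T, D) arrays, one per sampled utterance
    feats = np.concatenate(feature_arrays, axis=0)
    return feats.mean(axis=0), feats.std(axis=0)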

@ -13,130 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
from paddlespeech.dataset.s2t import format_data_main
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formatted manifest.", required=True)
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
fout = open(args.output_path, 'w', encoding='utf-8')
# get feat dim
filetype = args.cmvn_path.split(".")[-1]
mean, istd = load_cmvn(args.cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
    # jsonline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in args.manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
if __name__ == '__main__':
    main()
    format_data_main()
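
Each line written by format_data is a JSON object shaped like the comment above; an illustrative record (all values are made up):

# Illustrative formatted manifest line; values are invented for the example.
import json

record = {
    "utt": "111-2222-333",
    "utt2spk": "111-2222",
    "input": [{"name": "input1", "shape": [380, 80], "feat": "xxx.wav", "filetype": "sound"}],
    "output": [{"name": "target1", "shape": [12, 5002], "text": "a b c de",
                "token": "a b c d e", "tokenid": "5 6 7 8 9"}],
}
print(json.dumps(record))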

@ -11,96 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddlespeech.dataset.s2t import format_rsl_main
if __name__ == '__main__':
    format_rsl_main()
import argparse
import jsonlines
def trans_hyp(origin_hyp, trans_hyp=None, trans_hyp_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
            trans_hyp: The output file for calculating CER/WER
            trans_hyp_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin_hyp, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans_hyp is not None:
with open(trans_hyp, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_hyp_sclite is not None:
with open(trans_hyp_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
def trans_ref(origin_ref, trans_ref=None, trans_ref_sclite=None):
"""
Args:
            origin_ref: The input json file which contains the reference text
            trans_ref: The output file for calculating CER/WER
            trans_ref_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin_ref, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans_ref is not None:
with open(trans_ref, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_ref_sclite is not None:
with open(trans_ref_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='format hyp file for compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default=None, help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default=None,
        help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default=None,
        help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default=None, help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default=None,
        help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default=None,
        help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
if parser_args.origin_hyp is not None:
trans_hyp(
origin_hyp=parser_args.origin_hyp,
trans_hyp=parser_args.trans_hyp,
trans_hyp_sclite=parser_args.trans_hyp_sclite, )
if parser_args.origin_ref is not None:
trans_ref(
origin_ref=parser_args.origin_ref,
trans_ref=parser_args.trans_ref,
trans_ref_sclite=parser_args.trans_ref_sclite, )
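
The sclite-oriented files produced by trans_hyp/trans_ref hold one "text(utt.wav)" line per utterance; a compact sketch of that conversion (file names are illustrative):

# Compact sketch of the sclite 'trn' formatting done above; paths are illustrative.
import jsonlines

with jsonlines.open('exp/test/result.json', 'r') as reader, \
        open('exp/test/hyp.trn', 'w', encoding='utf8') as out:
    for item in reader:
        out.write(item['hyps'][0] + '(' + item['utt'] + '.wav' + ')\n')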

@ -22,8 +22,8 @@ import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
