pull/1068/head
commit a070524d37
@@ -0,0 +1 @@
# Demos for PaddleSpeech

@@ -0,0 +1,8 @@
# Style FastSpeech2

You can change the `pitch`, `duration`, and `energy` of `FastSpeech2`.

Run the following command line to get started:
```
./run.sh
```
For more details, please see `style_syn.py`.

@@ -0,0 +1,2 @@
--sample-frequency=16000
--num-mel-bins=80

@@ -0,0 +1 @@
--sample-frequency=16000
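
These two one-line configs look like Kaldi feature-extraction settings: the first extracts 80-dimensional fbank features at 16 kHz, the second presumably configures pitch extraction at the same rate. A minimal sketch of how such configs are typically consumed, assuming the conventional `conf/fbank.conf` and `conf/pitch.conf` names (the file paths are not shown in this diff):
```
# Hypothetical paths; the Kaldi "steps" directory is symlinked later in this diff.
steps/make_fbank_pitch.sh --fbank-config conf/fbank.conf --pitch-config conf/pitch.conf \
    data/train exp/make_fbank/train fbank
```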

@@ -0,0 +1,48 @@
#!/bin/bash

# Copyright 2019 Kyoto University (Hirofumi Inaguma)
#           2021 PaddlePaddle
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

. ./path.sh

if [ "$#" -ne 2 ]; then
    echo "Usage: $0 <set> <lang>"
    echo "e.g.: $0 dev zh"
    exit 1
fi

set=$1
lang=$2
export LC_ALL=en_US.UTF-8
# Copy stuff into its final locations [this has been moved from the format_data script]
# for En
mkdir -p ${set}.en
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    if [ -f ${set}/${f} ]; then
        sort ${set}/${f} > ${set}.en/${f}
    fi
done
sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text

utils/fix_data_dir.sh ${set}.en
if [ -f ${set}.en/feats.scp ]; then
    utils/validate_data_dir.sh ${set}.en || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1;
fi

# for target language
mkdir -p ${set}.${lang}
for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do
    if [ -f ${set}/${f} ]; then
        sort ${set}/${f} > ${set}.${lang}/${f}
    fi
done
sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text
utils/fix_data_dir.sh ${set}.${lang}
if [ -f ${set}.${lang}/feats.scp ]; then
    utils/validate_data_dir.sh ${set}.${lang} || exit 1;
else
    utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1;
fi
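
As a usage sketch (the script's path and name are not shown in the diff; `divide_lang.sh` is assumed): given a combined data directory `dev` containing `text.en` and `text.zh`, this produces sorted, validated monolingual directories `dev.en` and `dev.zh`:
```
# Assumed script name; arguments follow the usage message above.
./divide_lang.sh dev zh
```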

@@ -0,0 +1,27 @@
#!/usr/bin/env python
"""Convert an espnet data json file to a manifest jsonline file."""
import argparse
import json


def main(args):
    with open(args.json_file, 'r') as fin:
        data_json = json.load(fin)

    with open(args.manifest_file, 'w') as fout:
        for key, value in data_json['utts'].items():
            value['utt'] = key
            fout.write(json.dumps(value, ensure_ascii=False))
            fout.write("\n")


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--json-file', type=str, default=None, help="espnet data json file.")
    parser.add_argument(
        '--manifest-file',
        type=str,
        default='manifest.train',
        help='manifest data json line file.')
    args = parser.parse_args()
    main(args)
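
A usage sketch with an assumed script name; each utterance in the espnet `data.json` becomes one JSON line carrying its key under `utt`:
```
# Hypothetical file names; the flags come from the argparse setup above.
python3 espnet_json_to_manifest.py --json-file data.json --manifest-file manifest.train
```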

@@ -0,0 +1,25 @@
#!/usr/bin/perl

use warnings;
use strict;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

while (<STDIN>) {
    $_ = " $_ ";

    # remove punctuation except apostrophe
    s/<space>/spacemark/g;  # for scoring
    s/'/apostrophe/g;
    s/[[:punct:]]//g;
    s/apostrophe/'/g;
    s/spacemark/<space>/g;  # for scoring

    # remove whitespace
    s/\s+/ /g;
    s/^\s+//;
    s/\s+$//;

    print "$_\n";
}
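
The filter protects `<space>` markers and apostrophes before stripping all other POSIX punctuation. For example (assuming the file is saved as `remove_punctuation.pl`):
```
$ echo "Hello, world! It's a test." | perl remove_punctuation.pl
Hello world It's a test
```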

@@ -0,0 +1,104 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import codecs
import os


# org_split = 'train-split/train-segment'
# text_file = 'En-Zh/train.en-zh'
# data_split = 'train'
def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list,
                 data_split_list):

    for org_split, text_file, data_split in zip(wav_dir_list, text_file_list,
                                                data_split_list):
        local_data_split_dir = os.path.join(tgt_dir, data_split)

        os.makedirs(local_data_split_dir, exist_ok=True)
        utts = []
        utt2spk = {}
        with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \
                open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf:
            for files in os.listdir(os.path.join(src_dir, org_split)):
                files = files.strip()
                file_path = os.path.join(src_dir, org_split, files)
                size = os.path.getsize(file_path)
                if size <= 30000:
                    continue
                utt = files.split('.')[0]
                audio_name = utt.split('_')[0]
                # format the name of the utterance
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                utts.append(utt)
                spk = utt.split('_')[0]
                utt2spk[utt] = spk
                assert len(spk) == 16, "%r" % spk
                print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf)
            for utt in sorted(utts):
                print(utt, utt2spk[utt], file=utt2spk_wf)

        with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \
                open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \
                open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \
                codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8',
                            errors='ignore') as rf:
            count = 0
            for line in rf:
                line = line.strip()
                line_spl = line.split('\t')
                assert len(line_spl) == 3, "%r" % line
                wav, en, zh = line_spl
                assert wav.endswith('wav'), "%r" % wav[-3:]
                utt = wav.split('.')[0]
                audio_name = utt.split('_')[0]
                while len(audio_name) < 6:
                    utt = '0' + utt
                    audio_name = '0' + audio_name
                utt = 'ted-en-zh-' + utt
                print(utt, file=yaml_wf)
                print(en.lower(), file=en_wf)
                print(zh, file=zh_wf)
                count += 1
            print('%s set lines count: %d' % (data_split, count))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--src-dir",
        default="",
        type=str,
        help="Directory of the Kaldi-split data. (default: %(default)s)")
    parser.add_argument(
        "--tgt-dir",
        default="local/ted_en_zh",
        type=str,
        help="Directory to save the processed data. (default: %(default)s)")
    args = parser.parse_args()

    wav_dir_list = [
        'train-split/train-segment', 'test-segment/tst2014',
        'test-segment/tst2015'
    ]
    text_file_list = [
        'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh'
    ]
    data_split_list = ['train', 'dev', 'test']
    data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list,
                 data_split_list)
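
A usage sketch with an assumed script name; `--src-dir` must contain the TED En-Zh release with the `train-split/`, `test-segment/`, and `En-Zh/` subdirectories hard-coded above:
```
# Hypothetical source path; writes wav.scp.org, utt2spk.org, en.org, zh.org,
# and .yaml under local/ted_en_zh/{train,dev,test}.
python3 ted_en_zh.py --src-dir /path/to/TED_EnZh --tgt-dir local/ted_en_zh
```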

@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/steps

@@ -0,0 +1 @@
../../../tools/kaldi/egs/wsj/s5/utils

@@ -1,103 +0,0 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
from typing import List
from typing import Optional
from typing import Union

import paddle

from ..executor import BaseExecutor
from ..utils import cli_register

__all__ = ['S2TExecutor']


@cli_register(
    name='paddlespeech.s2t', description='Speech to text infer command.')
class S2TExecutor(BaseExecutor):
    def __init__(self):
        super(S2TExecutor, self).__init__()

        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.s2t', add_help=True)
        self.parser.add_argument(
            '--config',
            type=str,
            default=None,
            help='Config of s2t task. Use the default config when it is None.')
        self.parser.add_argument(
            '--input', type=str, help='Audio file to recognize.')
        self.parser.add_argument(
            '--device',
            type=str,
            default='cpu',
            help='Choose the device to execute model inference on.')

    def _get_default_cfg_path(self):
        """
        Returns a default config file path of the current task.
        """
        pass

    def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None):
        """
        Init model from a specific config file.
        """
        pass

    def preprocess(self, input: Union[str, os.PathLike]):
        """
        Input preprocess and return paddle.Tensor stored in self.input.
        Input content can be text (t2s), a file (s2t, cls), or a stream (not supported yet).
        """
        pass

    @paddle.no_grad()
    def infer(self):
        """
        Model inference and result stored in self.output.
        """
        pass

    def postprocess(self) -> Union[str, os.PathLike]:
        """
        Output postprocess and return human-readable results such as texts and audio files.
        """
        pass

    def execute(self, argv: List[str]) -> bool:
        parser_args = self.parser.parse_args(argv)
        print(parser_args)

        config = parser_args.config
        audio_file = parser_args.input
        device = parser_args.device

        if config is not None:
            assert os.path.isfile(config), 'Config file is not valid.'
        else:
            config = self._get_default_cfg_path()

        try:
            self._init_from_cfg(config)
            self.preprocess(audio_file)
            self.infer()
            res = self.postprocess()  # Retrieve result of s2t.
            print(res)
            return True
        except Exception as e:
            print(e)
            return False
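
For context, this removed stub registered a `paddlespeech.s2t` command via `cli_register`; a rough sketch of how its interface would have been exercised (the entry-point name is an assumption; the flags come from its parser above):
```
# Hypothetical entry point for the removed stub; flags match its argparse setup.
paddlespeech.s2t --config conf/s2t.yaml --input audio.wav --device cpu
```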

@@ -1,20 +0,0 @@
# Install conda dependencies
conda install -c conda-forge sox libsndfile swig bzip2 bottleneck gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 --yes

# Install the python lib
pip install -r requirements.txt

# Install the auto_log
pushd tools/extras
bash install_autolog.sh
popd

# Install the ctcdecoder
pushd paddlespeech/s2t/decoders/ctcdecoder/swig
bash -e setup.sh
popd

# Install the python_speech_features
pushd third_party
bash -e install.sh
popd

@@ -0,0 +1,152 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import codecs
import json
import logging
import sys
from distutils.util import strtobool

from espnet.utils.cli_utils import get_commandline_args

is_python2 = sys.version_info[0] == 2


def get_parser():
    parser = argparse.ArgumentParser(
        description="add multiple json values to an input or output value",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("jsons", type=str, nargs="+", help="json files")
    parser.add_argument(
        "-i",
        "--is-input",
        default=True,
        type=strtobool,
        help="If true, add to input. If false, add to output", )
    parser.add_argument(
        "--verbose", "-V", default=0, type=int, help="Verbose option")
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # make intersection set for utterance keys
    js = []
    intersec_ks = []  # becomes a set once the first json is read
    for x in args.jsons:
        with codecs.open(x, "r", encoding="utf-8") as f:
            j = json.load(f)
        ks = j["utts"].keys()
        logging.info(x + ": has " + str(len(ks)) + " utterances")
        if len(intersec_ks) > 0:
            intersec_ks = intersec_ks.intersection(set(ks))
            if len(intersec_ks) == 0:
                logging.warning("Empty intersection")
                break
        else:
            intersec_ks = set(ks)
        js.append(j)
    logging.info("new json has " + str(len(intersec_ks)) + " utterances")

    # updated original dict to keep intersection
    intersec_org_dic = dict()
    for k in intersec_ks:
        v = js[0]["utts"][k]
        intersec_org_dic[k] = v

    intersec_add_dic = dict()
    for k in intersec_ks:
        v = js[1]["utts"][k]
        for j in js[2:]:
            v.update(j["utts"][k])
        intersec_add_dic[k] = v

    new_dic = dict()
    for key_id in intersec_org_dic:
        orgdic = intersec_org_dic[key_id]
        adddic = intersec_add_dic[key_id]

        if "utt2spk" not in orgdic:
            orgdic["utt2spk"] = ""
        # NOTE: for machine translation

        # add as input
        if args.is_input:
            # original input
            input_list = orgdic["input"]
            # additional input
            in_add_dic = {}
            if "idim" in adddic and "ilen" in adddic:
                in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])]
            elif "idim" in adddic:
                in_add_dic["shape"] = [int(adddic["idim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["idim", "ilen"]:
                    continue
                in_add_dic[key] = value
            # add name
            in_add_dic["name"] = "input%d" % (len(input_list) + 1)

            input_list.append(in_add_dic)
            new_dic[key_id] = {
                "input": input_list,
                "output": orgdic["output"],
                "utt2spk": orgdic["utt2spk"],
            }
        # add as output
        else:
            # original output
            output_list = orgdic["output"]
            # additional output
            out_add_dic = {}
            # add shape
            if "odim" in adddic and "olen" in adddic:
                out_add_dic[
                    "shape"] = [int(adddic["olen"]), int(adddic["odim"])]
            elif "odim" in adddic:
                out_add_dic["shape"] = [int(adddic["odim"])]
            # add all other key value
            for key, value in adddic.items():
                if key in ["odim", "olen"]:
                    continue
                out_add_dic[key] = value
            # add name
            out_add_dic["name"] = "target%d" % (len(output_list) + 1)

            output_list.append(out_add_dic)
            new_dic[key_id] = {
                "input": orgdic["input"],
                "output": output_list,
                "utt2spk": orgdic["utt2spk"],
            }
        if "lang" in orgdic.keys():
            new_dic[key_id]["lang"] = orgdic["lang"]

    # ensure "ensure_ascii=False"; otherwise non-ASCII characters get escaped
    jsonstring = json.dumps(
        {
            "utts": new_dic
        },
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "), )
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
                                           if is_python2 else sys.stdout.buffer)
    print(jsonstring)
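
A usage sketch (file names assumed): with `-i false`, the extra keys from the trailing JSON files are merged into each utterance's `output` list of the first JSON, which is what the shell script at the end of this diff relies on:
```
# Hypothetical file names; the merged json is written to stdout.
python3 addjson.py --verbose 1 -i false data.json token.json tokenid.json > data_new.json
```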

@@ -0,0 +1,49 @@
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import codecs
import json
import sys

is_python2 = sys.version_info[0] == 2


def get_parser():
    parser = argparse.ArgumentParser(
        description="convert scp to json",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter, )
    parser.add_argument("--key", "-k", type=str, help="key")
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    new_line = {}
    sys.stdin = codecs.getreader("utf-8")(sys.stdin
                                          if is_python2 else sys.stdin.buffer)
    sys.stdout = codecs.getwriter("utf-8")(sys.stdout
                                           if is_python2 else sys.stdout.buffer)
    line = sys.stdin.readline()
    while line:
        x = line.rstrip().split()
        v = {args.key: " ".join(x[1:])}
        new_line[x[0]] = v
        line = sys.stdin.readline()

    all_l = {"utts": new_line}

    # ensure "ensure_ascii=False"; otherwise non-ASCII characters get escaped
    jsonstring = json.dumps(
        all_l,
        indent=4,
        ensure_ascii=False,
        sort_keys=True,
        separators=(",", ": "))
    print(jsonstring)
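
For example, piping a two-column scp file through the script (the `scp2json.py` name matches the call in the shell script below) yields:
```
$ printf 'utt1 HELLO WORLD\n' | python3 scp2json.py --key text
{
    "utts": {
        "utt1": {
            "text": "HELLO WORLD"
        }
    }
}
```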

@@ -0,0 +1,88 @@
#!/bin/bash

# Copyright 2020 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

echo "$0 $*" >&2 # Print the command line for logging
. ./path.sh

nlsyms=""
oov="<unk>"
bpecode=""
verbose=0

text=""
multilingual=false

help_message=$(cat << EOF
Usage: $0 <json> <data-dir> <dict>
e.g. $0 data/train/data.json data/train data/lang_1char/train_units.txt
Options:
  --oov <oov-word>    # Default: <unk>
  --verbose <num>     # Default: 0
EOF
)
. utils/parse_options.sh

if [ $# != 3 ]; then
    echo "${help_message}" 1>&2
    exit 1;
fi

set -euo pipefail

json=$1
dir=$2
dic=$3
json_dir=$(dirname ${json})
tmpdir=$(mktemp -d ${dir}/tmp-XXXXX)
trap 'rm -rf ${tmpdir}' EXIT

if [ -z ${text} ]; then
    text=${dir}/text
fi

# 1. Create scp files for outputs
mkdir -p ${tmpdir}/output
if [ -n "${bpecode}" ]; then
    if [ ${multilingual} = true ]; then
        # remove a space before the language ID
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \
            > ${tmpdir}/output/token.scp
    else
        paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \
            | spm_encode --model=${bpecode} --output_format=piece) \
            > ${tmpdir}/output/token.scp
    fi
elif [ -n "${nlsyms}" ]; then
    text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp
else
    text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp
fi
< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp
awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp
# +2 comes from CTC blank and EOS
vocsize=$(tail -n 1 ${dic} | awk '{print $2}')
odim=$(echo "$vocsize + 2" | bc)
awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp

cat ${text} > ${tmpdir}/output/text.scp


# 2. Create JSON files from each scp file
rm -f ${tmpdir}/*/*.json
for x in "${tmpdir}"/output/*.scp; do
    k=$(basename ${x} .scp)
    < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json
done

# add to json
addjson.py --verbose ${verbose} -i false \
    ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json
mkdir -p ${json_dir}/.backup
echo "json updated. original json is kept in ${json_dir}/.backup."
cp ${json} ${json_dir}/.backup/"$(basename ${json})"
cp ${tmpdir}/data.json ${json}

rm -fr ${tmpdir}
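
A usage sketch (`update_json.sh` is a placeholder; the script's real path is not shown in this diff): it tokenizes `${dir}/text`, converts the token/tokenid/olen/odim scp files to JSON, and merges them into the output side of an existing `data.json`, keeping a backup:
```
# Placeholder script name; arguments follow the usage message above.
./update_json.sh --oov "<unk>" \
    data/train/data.json data/train data/lang_1char/train_units.txt
```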