From e9798498d686e568d4d3488952f8cd2abec9a05f Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Mon, 29 Nov 2021 18:01:39 +0800 Subject: [PATCH 01/53] Update asr inference in paddlespeech.cli. --- paddlespeech/cli/executor.py | 9 +-- paddlespeech/cli/s2t/conf/default_conf.yaml | 0 paddlespeech/cli/s2t/infer.py | 67 +++++++++++++--- paddlespeech/cli/utils.py | 86 ++++++++++++++++++--- 4 files changed, 136 insertions(+), 26 deletions(-) delete mode 100644 paddlespeech/cli/s2t/conf/default_conf.yaml diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index 45472fa4b..2261e011b 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -14,7 +14,6 @@ import os from abc import ABC from abc import abstractmethod -from typing import Optional from typing import Union import paddle @@ -30,16 +29,16 @@ class BaseExecutor(ABC): self.output = None @abstractmethod - def _get_default_cfg_path(self): + def _get_pretrained_path(self, tag: str) -> os.PathLike: """ - Returns a default config file path of current task. + Download and returns pretrained resources path of current task. """ pass @abstractmethod - def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + def _init_from_path(self, *args, **kwargs): """ - Init model from a specific config file. + Init model and other resources from a specific path. """ pass diff --git a/paddlespeech/cli/s2t/conf/default_conf.yaml b/paddlespeech/cli/s2t/conf/default_conf.yaml deleted file mode 100644 index e69de29bb..000000000 diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py index 682279852..6aa29addf 100644 --- a/paddlespeech/cli/s2t/infer.py +++ b/paddlespeech/cli/s2t/infer.py @@ -21,9 +21,21 @@ import paddle from ..executor import BaseExecutor from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import logger +from ..utils import MODEL_HOME __all__ = ['S2TExecutor'] +pretrained_models = { + "wenetspeech_zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'md5': + '54e7a558a6e020c2f5fb224874943f97', + } +} + @cli_register( name='paddlespeech.s2t', description='Speech to text infer command.') @@ -33,11 +45,23 @@ class S2TExecutor(BaseExecutor): self.parser = argparse.ArgumentParser( prog='paddlespeech.s2t', add_help=True) + self.parser.add_argument( + '--model', + type=str, + default='wenetspeech', + help='Choose model type of asr task.') + self.parser.add_argument( + '--lang', type=str, default='zh', help='Choose model language.') self.parser.add_argument( '--config', type=str, default=None, help='Config of s2t task. Use deault config when it is None.') + self.parser.add_argument( + '--ckpt_path', + type=str, + default=None, + help='Checkpoint file of model.') self.parser.add_argument( '--input', type=str, help='Audio file to recognize.') self.parser.add_argument( @@ -46,16 +70,39 @@ class S2TExecutor(BaseExecutor): default='cpu', help='Choose device to execute model inference.') - def _get_default_cfg_path(self): + def _get_pretrained_path(self, tag: str) -> os.PathLike: """ - Returns a default config file path of current task. + Download and returns pretrained resources path of current task. 
""" - pass + assert tag in pretrained_models, 'Can not find pretrained resources of {}.'.format( + tag) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path - def _init_from_cfg(self, cfg_path: Optional[os.PathLike]=None): + def _init_from_path(self, + model_type: str='wenetspeech', + lang: str='zh', + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None): """ - Init model from a specific config file. + Init model and other resources from a specific path. """ + if cfg_path is None or ckpt_path is None: + res_path = self._get_pretrained_path( + model_type + '_' + lang) # wenetspeech_zh + cfg_path = os.path.join(res_path, 'conf/conformer.yaml') + ckpt_path = os.path.join( + res_path, 'exp/conformer/checkpoints/wenetspeech.pdparams') + logger.info(res_path) + logger.info(cfg_path) + logger.info(ckpt_path) + + # Init body. pass def preprocess(self, input: Union[str, os.PathLike]): @@ -82,17 +129,15 @@ class S2TExecutor(BaseExecutor): parser_args = self.parser.parse_args(argv) print(parser_args) + model = parser_args.model + lang = parser_args.lang config = parser_args.config + ckpt_path = parser_args.ckpt_path audio_file = parser_args.input device = parser_args.device - if config is not None: - assert os.path.isfile(config), 'Config file is not valid.' - else: - config = self._get_default_cfg_path() - try: - self._init_from_cfg(config) + self._init_from_path(model, lang, config, ckpt_path) self.preprocess(audio_file) self.infer() res = self.postprocess() # Retrieve result of s2t. diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index c83deee89..edf579f71 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import functools +import logging import os from typing import Any from typing import Dict -from typing import List from paddle.framework import load from paddle.utils import download @@ -26,6 +27,7 @@ __all__ = [ 'get_command', 'download_and_decompress', 'load_state_dict_from_url', + 'logger', ] @@ -53,29 +55,27 @@ def get_command(name: str) -> Any: return com['_entry'] -def decompress(file: str): +def decompress(file: str) -> os.PathLike: """ Extracts all files from a compressed file. """ assert os.path.isfile(file), "File: {} not exists.".format(file) - download._decompress(file) + return download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], path: str): +def download_and_decompress(archive: Dict[str, str], path: str) -> os.PathLike: """ Download archieves and decompress to specific path. 
""" if not os.path.isdir(path): os.makedirs(path) - for archive in archives: - assert 'url' in archive and 'md5' in archive, \ - 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {}'.format(list(archive.keys())) + return download.get_path_from_url(archive['url'], path, archive['md5']) - download.get_path_from_url(archive['url'], path, archive['md5']) - -def load_state_dict_from_url(url: str, path: str, md5: str=None): +def load_state_dict_from_url(url: str, path: str, md5: str=None) -> os.PathLike: """ Download and load a state dict from url """ @@ -84,3 +84,69 @@ def load_state_dict_from_url(url: str, path: str, md5: str=None): download.get_path_from_url(url, path, md5) return load(os.path.join(path, os.path.basename(url))) + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_paddlespcceh_home(): + if 'PPSPEECH_HOME' in os.environ: + home_path = os.environ['PPSPEECH_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError( + 'The environment variable PPSPEECH_HOME {} is not a directory.'. + format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddlespeech') + + +def _get_sub_home(directory): + home = os.path.join(_get_paddlespcceh_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +PPSPEECH_HOME = _get_paddlespcceh_home() +MODEL_HOME = _get_sub_home('models') + + +class Logger(object): + def __init__(self, name: str=None): + name = 'PaddleSpeech' if not name else name + self.logger = logging.getLogger(name) + + log_config = { + 'DEBUG': 10, + 'INFO': 20, + 'TRAIN': 21, + 'EVAL': 22, + 'WARNING': 30, + 'ERROR': 40, + 'CRITICAL': 50 + } + for key, level in log_config.items(): + logging.addLevelName(level, key) + self.__dict__[key.lower()] = functools.partial(self.__call__, level) + + self.format = logging.Formatter( + fmt='[%(asctime)-15s] [%(levelname)8s] [%(filename)s] [L%(lineno)d] - %(message)s' + ) + + self.handler = logging.StreamHandler() + self.handler.setFormatter(self.format) + + self.logger.addHandler(self.handler) + self.logger.setLevel(logging.DEBUG) + self.logger.propagate = False + + def __call__(self, log_level: str, msg: str): + self.logger.log(log_level, msg) + + +logger = Logger() From 383b68d8f47f15c86ea1f9bdce90fe39d8ee3b58 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:03 -0800 Subject: [PATCH 02/53] minor --- dataset/ted_en_zh/ted_en_zh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset/ted_en_zh/ted_en_zh.py b/dataset/ted_en_zh/ted_en_zh.py index 9a3ba3b31..2d1fc6710 100644 --- a/dataset/ted_en_zh/ted_en_zh.py +++ b/dataset/ted_en_zh/ted_en_zh.py @@ -28,7 +28,7 @@ import soundfile parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( - "--src_dir", + "--src-dir", default="", type=str, help="Directory to kaldi splited data. 
(default: %(default)s)") From 6a50211c8042ce15392f37403edb54e59dd9a568 Mon Sep 17 00:00:00 2001 From: Junkun Date: Thu, 25 Nov 2021 21:20:37 -0800 Subject: [PATCH 03/53] data process for ted-en-zh st1 --- examples/ted_en_zh/st1/local/data.sh | 214 +++++++++++++++++++-------- examples/ted_en_zh/st1/path.sh | 10 +- 2 files changed, 161 insertions(+), 63 deletions(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index aa958cfde..72d141e7d 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,16 +2,18 @@ set -e -stage=-1 +stage=1 stop_stage=100 dict_dir=data/lang_char # bpemode (unigram or bpe) nbpe=8000 -bpemode=unigram +bpemode=bpe bpeprefix="${dict_dir}/bpe_${bpemode}_${nbpe}" data_dir=./TED_EnZh - +target_dir=data/ted_en_zh +dumpdir=data/dump +do_delta=false source ${MAIN_ROOT}/utils/parse_options.sh @@ -38,75 +40,163 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # generate manifests - python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \ - --manifest_prefix="data/manifest" \ - --src_dir="${data_dir}" + # # extract data + # echo "data Extraction" + # python3 local/ted_en_zh.py \ + # --tgt-dir=${target_dir} \ + # --src-dir=${data_dir} - echo "Complete raw data pre-process." fi - +prep_dir=${target_dir}/data_prep if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # compute mean and stddev for normalizer - num_workers=$(nproc) - python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ - --manifest_path="data/manifest.train.raw" \ - --num_samples=-1 \ - --spectrum_type="fbank" \ - --feat_dim=80 \ - --delta_delta=false \ - --sample_rate=16000 \ - --stride_ms=10.0 \ - --window_ms=25.0 \ - --use_dB_normalization=False \ - --num_workers=${num_workers} \ - --output_path="data/mean_std.json" - - if [ $? -ne 0 ]; then - echo "Compute mean and stddev failed. Terminated." - exit 1 - fi + ### Task dependent. You have to make data the following preparation part by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 0: Data preparation" + for set in train dev test; do + # for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utils/utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${prep_dir}/${set}.en-zh + mkdir -p ${prep_dir}/${set}.en-zh + echo "remove duplicate lines..." 
+ cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${prep_dir}/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${prep_dir}/${set}.en-zh/text.${l} + done + utils/fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${prep_dir}/${set}.en-zh + done fi +feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} +feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # build vocabulary - python3 ${MAIN_ROOT}/utils/build_vocab.py \ - --unit_type "spm" \ - --spm_vocab_size=${nbpe} \ - --spm_mode ${bpemode} \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --text_keys 'text' 'text1' \ - --manifest_paths="data/manifest.train.raw" - - - if [ $? -ne 0 ]; then - echo "Build vocabulary failed. Terminated." - exit 1 - fi + ### Task dependent. You have to design training and dev sets by yourself. + ### But you can utilize Kaldi recipes in most cases + echo "stage 1: Feature Generation" + fbankdir=data/fbank + # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame + for x in train dev test; do + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} + done + + echo "speed perturbation" + utils/perturb_data_dir_speed.sh 0.9 ${prep_dir}/train.en-zh ${prep_dir}/temp1.en-zh + utils/perturb_data_dir_speed.sh 1.0 ${prep_dir}/train.en-zh ${prep_dir}/temp2.en-zh + utils/perturb_data_dir_speed.sh 1.1 ${prep_dir}/train.en-zh ${prep_dir}/temp3.en-zh + + utils/combine_data.sh --extra-files utt2uniq ${prep_dir}/train_sp.en-zh \ + ${prep_dir}/temp1.en-zh ${prep_dir}/temp2.en-zh ${prep_dir}/temp3.en-zh + rm -r ${prep_dir}/temp*.en-zh + utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh + + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} + + for lang in en zh; do + cat /dev/null > ${prep_dir}/train_sp.en-zh/text.${lang} + for p in "sp0.9-" "sp1.0-" "sp1.1-"; do + awk -v p=${p} '{printf("%s %s%s\n", $1, p, $1);}' ${prep_dir}/train.en-zh/utt2spk > ${prep_dir}/train_sp.en-zh/utt_map + utils/apply_map.pl -f 1 ${prep_dir}/train_sp.en-zh/utt_map < ${prep_dir}/train.en-zh/text.${lang} >>${prep_dir}/train_sp.en-zh/text.${lang} + done + done + + for x in train_sp dev test; do + local/divide_lang.sh ${prep_dir}/${x}.en-zh zh + done + + for x in train_sp dev; do + # remove utt having more than 3000 frames + # remove utt having more than 400 characters + for lang in zh en; do + remove_longshortdata.sh --maxframes 3000 --maxchars 400 ${prep_dir}/${x}.en-zh.${lang} ${prep_dir}/${x}.en-zh.${lang}.tmp + done + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.en.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 + cut -f 1 -d " " ${prep_dir}/${x}.en-zh.${lang}.tmp/text > ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 + comm -12 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist1 ${prep_dir}/${x}.en-zh.${lang}.tmp/reclist2 > ${prep_dir}/${x}.en-zh.en.tmp/reclist + + for lang in zh en; do + reduce_data_dir.sh 
${prep_dir}/${x}.en-zh.${lang}.tmp ${prep_dir}/${x}.en-zh.en.tmp/reclist ${prep_dir}/${x}.en-zh.${lang} + utils/fix_data_dir.sh ${prep_dir}/${x}.en-zh.${lang} + done + rm -rf ${prep_dir}/${x}.en-zh.*.tmp + done + + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + + dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ + ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} fi +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt +bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # format manifest with tokenids, vocab size - for set in train dev test; do - { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ - --feat_type "raw" \ - --cmvn_path "data/mean_std.json" \ - --unit_type "spm" \ - --spm_model_prefix ${bpeprefix} \ - --vocab_path="${dict_dir}/vocab.txt" \ - --manifest_path="data/manifest.${set}.raw" \ - --output_path="data/manifest.${set}" - - if [ $? -ne 0 ]; then - echo "Formt mnaifest failed. Terminated." - exit 1 - fi - }& + echo "stage 2: Dictionary and Json Data Preparation" + # echo "make a non-linguistic symbol list for all languages" + # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} + # cat ${nlsyms} + + echo "make a joint source and target dictionary" + echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC + offset=$(wc -l < ${dict}) + grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -v -e '^\s*$' > ${dict_dir}/input.txt + spm_train --input=${dict_dir}/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --character_coverage=1.0 + spm_encode --model=${bpemodel}.model --output_format=piece < ${dict_dir}/input.txt | tr ' ' '\n' | sort | uniq | awk -v offset=${offset} '{print $0 " " NR+offset}' >> ${dict} + wc -l ${dict} + + echo "make json files" + data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json + data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json + echo "update json (add source references)" + # update json (add source references) + for x in ${train_set} ${train_dev}; do + feat_dir=${dumpdir}/${x}/delta${do_delta} + data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en + update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done - wait fi - echo "Ted En-Zh Data preparation done." 
exit 0 diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index fd537917a..ee4c9779f 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -1,6 +1,6 @@ export MAIN_ROOT=`realpath ${PWD}/../../../` -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PWD}/utils:${PATH} export LC_ALL=C export PYTHONDONTWRITEBYTECODE=1 @@ -13,3 +13,11 @@ export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ MODEL=u2_st export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin + +# Kaldi +export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh +export train_cmd="run.pl" \ No newline at end of file From cdd084512783303b9c606dc4c4e0aa739e6b8c3e Mon Sep 17 00:00:00 2001 From: Junkun Date: Sun, 28 Nov 2021 22:59:37 -0800 Subject: [PATCH 04/53] add translate function --- paddlespeech/s2t/exps/u2_st/model.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index 52d3c3b7d..034463fea 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -26,8 +26,10 @@ from paddle import distributed as dist from paddle.io import DataLoader from yacs.config import CfgNode +from paddlespeech.s2t.frontend.featurizer import TextFeaturizer from paddlespeech.s2t.io.collator import SpeechCollator from paddlespeech.s2t.io.collator import TripletSpeechCollator +from paddlespeech.s2t.io.dataloader import BatchDataLoader from paddlespeech.s2t.io.dataset import ManifestDataset from paddlespeech.s2t.io.sampler import SortagradBatchSampler from paddlespeech.s2t.io.sampler import SortagradDistributedBatchSampler @@ -423,6 +425,30 @@ class U2STTester(U2STTrainer): trans.append(''.join([chr(i) for i in ids])) return trans + def translate(self, audio, audio_len): + """"E2E translation from extracted audio feature""" + cfg = self.config.decoding + text_feature = self.test_loader.collate_fn.text_feature + + hyps = self.model.decode( + audio, + audio_len, + text_feature=text_feature, + decoding_method=cfg.decoding_method, + lang_model_path=cfg.lang_model_path, + beam_alpha=cfg.alpha, + beam_beta=cfg.beta, + beam_size=cfg.beam_size, + cutoff_prob=cfg.cutoff_prob, + cutoff_top_n=cfg.cutoff_top_n, + num_processes=cfg.num_proc_bsearch, + ctc_weight=cfg.ctc_weight, + word_reward=cfg.word_reward, + decoding_chunk_size=cfg.decoding_chunk_size, + num_decoding_left_chunks=cfg.num_decoding_left_chunks, + simulate_streaming=cfg.simulate_streaming) + return hyps + def compute_translation_metrics(self, utts, audio, From 8f3280af8e73c90b148a94800948e4dc7273696a Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:30:18 -0800 Subject: [PATCH 05/53] fix data process --- examples/ted_en_zh/st1/local/data.sh | 53 +++++++++--------- examples/ted_en_zh/st1/local/data_prep.sh | 54 +++++++++++++++++++ examples/ted_en_zh/st1/local/divide_lang.sh | 48 +++++++++++++++++ .../st1/local/espnet_json_to_manifest.py | 27 ++++++++++ .../ted_en_zh/st1/local/remove_punctuation.pl | 25 +++++++++ 5 files changed, 183 insertions(+), 24 deletions(-) create mode 
100755 examples/ted_en_zh/st1/local/data_prep.sh create mode 100755 examples/ted_en_zh/st1/local/divide_lang.sh create mode 100644 examples/ted_en_zh/st1/local/espnet_json_to_manifest.py create mode 100755 examples/ted_en_zh/st1/local/remove_punctuation.pl diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 72d141e7d..8b829a8a1 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=1 +stage=3 stop_stage=100 dict_dir=data/lang_char @@ -14,6 +14,7 @@ data_dir=./TED_EnZh target_dir=data/ted_en_zh dumpdir=data/dump do_delta=false +nj=20 source ${MAIN_ROOT}/utils/parse_options.sh @@ -40,11 +41,11 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then exit 1 fi - # # extract data - # echo "data Extraction" - # python3 local/ted_en_zh.py \ - # --tgt-dir=${target_dir} \ - # --src-dir=${data_dir} + # extract data + echo "data Extraction" + python3 local/ted_en_zh.py \ + --tgt-dir=${target_dir} \ + --src-dir=${data_dir} fi prep_dir=${target_dir}/data_prep @@ -99,7 +100,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then done fi -feat_tr_dir=${dumpdir}/train/delta${do_delta}; mkdir -p ${feat_tr_dir} +feat_tr_dir=${dumpdir}/train_sp/delta${do_delta}; mkdir -p ${feat_tr_dir} feat_dt_dir=${dumpdir}/dev/delta${do_delta}; mkdir -p ${feat_dt_dir} feat_trans_dir=${dumpdir}/test/delta${do_delta}; mkdir -p ${feat_trans_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then @@ -109,7 +110,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then fbankdir=data/fbank # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame for x in train dev test; do - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/${x}.en-zh data/make_fbank/${x} ${fbankdir} done @@ -123,7 +124,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -r ${prep_dir}/temp*.en-zh utils/fix_data_dir.sh ${prep_dir}/train_sp.en-zh - steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj 32 --write_utt2num_frames true \ + steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \ ${prep_dir}/train_sp.en-zh exp/make_fbank/train_sp.en-zh ${fbankdir} for lang in en zh; do @@ -155,14 +156,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then rm -rf ${prep_dir}/${x}.en-zh.*.tmp done - compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark + compute-cmvn-stats scp:${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark - dump.sh --cmd "$train_cmd" --nj 80 --do_delta $do_delta \ - ${prep_dir}/train_sp.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh ${feat_tr_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/dev.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh ${feat_dt_dir} - dump.sh --cmd "$train_cmd" --nj 32 --do_delta $do_delta \ - ${prep_dir}/test.en-zh/feats.scp ${prep_dir}/train_sp.en-zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh ${feat_trans_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/train_sp.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/train_sp.en-zh.zh ${feat_tr_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/dev.en-zh.zh/feats.scp 
${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/dev.en-zh.zh ${feat_dt_dir} + dump.sh --cmd "$train_cmd" --nj ${nj} --do_delta $do_delta \ + ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt @@ -170,9 +171,6 @@ nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "stage 2: Dictionary and Json Data Preparation" - # echo "make a non-linguistic symbol list for all languages" - # grep sp1.0 ${prep_dir}/train_sp.en-zh.*/text | cut -f 2- -d' ' | grep -o -P '&[^;];'| sort | uniq > ${nlsyms} - # cat ${nlsyms} echo "make a joint source and target dictionary" echo " 1" > ${dict} # must be 1, 0 will be used for "blank" in CTC @@ -183,20 +181,27 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then wc -l ${dict} echo "make json files" - data2json.sh --nj 16 --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --nj ${nj} --feat ${feat_tr_dir}/feats.scp --text ${prep_dir}/train_sp.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/train_sp.en-zh.zh ${dict} > ${feat_tr_dir}/data_${bpemode}${nbpe}.json data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/dev.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/dev.en-zh.zh ${dict} > ${feat_dt_dir}/data_${bpemode}${nbpe}.json - data2json.sh --feat ${feat_dt_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ + data2json.sh --feat ${feat_trans_dir}/feats.scp --text ${prep_dir}/test.en-zh.zh/text --bpecode ${bpemodel}.model --lang zh \ ${prep_dir}/test.en-zh.zh ${dict} > ${feat_trans_dir}/data_${bpemode}${nbpe}.json echo "update json (add source references)" # update json (add source references) - for x in ${train_set} ${train_dev}; do + for x in train_sp dev; do feat_dir=${dumpdir}/${x}/delta${do_delta} - data_dir=data/$(echo ${x} | cut -f 1 -d ".").en-zh.en - update_json.sh --text ${data_dir}/text.${src_case} --bpecode ${bpemodel}.model \ + data_dir=${prep_dir}/$(echo ${x} | cut -f 1 -d ".").en-zh.en + update_json.sh --text ${data_dir}/text --bpecode ${bpemodel}.model \ ${feat_dir}/data_${bpemode}${nbpe}.json ${data_dir} ${dict} done fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: Format the Json Data" + python3 local/espnet_json_to_manifest.py --json-file ${feat_tr_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.train + python3 local/espnet_json_to_manifest.py --json-file ${feat_dt_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.dev + python3 local/espnet_json_to_manifest.py --json-file ${feat_trans_dir}/data_${bpemode}${nbpe}.json --manifest-file data/manifest.test +fi echo "Ted En-Zh Data preparation done." 
exit 0 diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh new file mode 100755 index 000000000..339cee1eb --- /dev/null +++ b/examples/ted_en_zh/st1/local/data_prep.sh @@ -0,0 +1,54 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +export LC_ALL=C + +data_dir=${1} + +for set in train dev test; do +# for set in train; do + dst=${target_dir}/${set} + for lang in en zh; do + + if [ ${lang} = 'en' ]; then + echo "remove punctuation $lang" + # remove punctuation + local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw + else + cp ${dst}/${lang}.org ${dst}/${lang}.raw + fi + + paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} + + + done + # error check + n=$(cat ${dst}/.yaml | wc -l) + n_en=$(cat ${dst}/en.raw | wc -l) + n_tgt=$(cat ${dst}/zh.raw | wc -l) + [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; + [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; + + echo "done text processing" + cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp + cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk + + cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt + rm -rf ${target_dir}/data_prep/${set}.en-zh + mkdir -p ${target_dir}/data_prep/${set}.en-zh + echo "remove duplicate lines..." + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ + | sed 's/^[ \t]*//' > ${dst}/duplicate_lines + cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ + | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist + reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh + echo "done wav processing" + for l in en zh; do + cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} + done + fix_data_dir.sh --utt_extra_files \ + "text.en text.zh" \ + ${target_dir}/data_prep/${set}.en-zh +done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/local/divide_lang.sh b/examples/ted_en_zh/st1/local/divide_lang.sh new file mode 100755 index 000000000..4e5f85c86 --- /dev/null +++ b/examples/ted_en_zh/st1/local/divide_lang.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright 2019 Kyoto University (Hirofumi Inaguma) +# 2021 PaddlePaddle +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +. 
./path.sh + +if [ "$#" -ne 2 ]; then + echo "Usage: $0 >" + echo "e.g.: $0 dev" + exit 1 +fi + +set=$1 +lang=$2 +export LC_ALL=en_US.UTF-8 +# Copy stuff intoc its final locations [this has been moved from the format_data script] +# for En +mkdir -p ${set}.en +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.en/${f} + fi +done +sort ${set}/text.en | sed $'s/[^[:print:]]//g' > ${set}.en/text + +utils/fix_data_dir.sh ${set}.en +if [ -f ${set}.en/feats.scp ]; then + utils/validate_data_dir.sh ${set}.en || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.en || exit 1; +fi + +# for target language +mkdir -p ${set}.${lang} +for f in spk2utt utt2spk segments wav.scp feats.scp utt2num_frames; do + if [ -f ${set}/${f} ]; then + sort ${set}/${f} > ${set}.${lang}/${f} + fi +done +sort ${set}/text.${lang} | sed $'s/[^[:print:]]//g' > ${set}.${lang}/text +utils/fix_data_dir.sh ${set}.${lang} +if [ -f ${set}.${lang}/feats.scp ]; then + utils/validate_data_dir.sh ${set}.${lang} || exit 1; +else + utils/validate_data_dir.sh --no-feats --no-wav ${set}.${lang} || exit 1; +fi diff --git a/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py new file mode 100644 index 000000000..60d254367 --- /dev/null +++ b/examples/ted_en_zh/st1/local/espnet_json_to_manifest.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +import argparse +import json + + +def main(args): + with open(args.json_file, 'r') as fin: + data_json = json.load(fin) + + with open(args.manifest_file, 'w') as fout: + for key, value in data_json['utts'].items(): + value['utt'] = key + fout.write(json.dumps(value, ensure_ascii=False)) + fout.write("\n") + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--json-file', type=str, default=None, help="espnet data json file.") + parser.add_argument( + '--manifest-file', + type=str, + default='manifest.train', + help='manifest data json line file.') + args = parser.parse_args() + main(args) diff --git a/examples/ted_en_zh/st1/local/remove_punctuation.pl b/examples/ted_en_zh/st1/local/remove_punctuation.pl new file mode 100755 index 000000000..89e19c6f4 --- /dev/null +++ b/examples/ted_en_zh/st1/local/remove_punctuation.pl @@ -0,0 +1,25 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +while() { + $_ = " $_ "; + + # remove punctuation except apostrophe + s//spacemark/g; # for scoring + s/'/apostrophe/g; + s/[[:punct:]]//g; + s/apostrophe/'/g; + s/spacemark//g; # for scoring + + # remove whitespace + s/\s+/ /g; + s/^\s+//; + s/\s+$//; + + print "$_\n"; +} From ea35558ee03527b57cfacccf272f405ca427d0b2 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:31:45 -0800 Subject: [PATCH 06/53] add utils --- utils/addjson.py | 155 +++++++++++ utils/scp2json.py | 48 ++++ utils/tokenizer.perl | 596 +++++++++++++++++++++++++++++++++++++++++++ utils/update_json.sh | 88 +++++++ 4 files changed, 887 insertions(+) create mode 100755 utils/addjson.py create mode 100755 utils/scp2json.py create mode 100644 utils/tokenizer.perl create mode 100755 utils/update_json.sh diff --git a/utils/addjson.py b/utils/addjson.py new file mode 100755 index 000000000..7fabe625e --- /dev/null +++ b/utils/addjson.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2018 Nagoya University (Tomoki Hayashi) +# Apache 2.0 
(http://www.apache.org/licenses/LICENSE-2.0) + +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import logging +import sys + +from distutils.util import strtobool + +from espnet.utils.cli_utils import get_commandline_args + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="add multiple json values to an input or output value", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("jsons", type=str, nargs="+", help="json files") + parser.add_argument( + "-i", + "--is-input", + default=True, + type=strtobool, + help="If true, add to input. If false, add to output", + ) + parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + # logging info + logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s" + if args.verbose > 0: + logging.basicConfig(level=logging.INFO, format=logfmt) + else: + logging.basicConfig(level=logging.WARN, format=logfmt) + logging.info(get_commandline_args()) + + # make intersection set for utterance keys + js = [] + intersec_ks = [] + for x in args.jsons: + with codecs.open(x, "r", encoding="utf-8") as f: + j = json.load(f) + ks = j["utts"].keys() + logging.info(x + ": has " + str(len(ks)) + " utterances") + if len(intersec_ks) > 0: + intersec_ks = intersec_ks.intersection(set(ks)) + if len(intersec_ks) == 0: + logging.warning("Empty intersection") + break + else: + intersec_ks = set(ks) + js.append(j) + logging.info("new json has " + str(len(intersec_ks)) + " utterances") + + # updated original dict to keep intersection + intersec_org_dic = dict() + for k in intersec_ks: + v = js[0]["utts"][k] + intersec_org_dic[k] = v + + intersec_add_dic = dict() + for k in intersec_ks: + v = js[1]["utts"][k] + for j in js[2:]: + v.update(j["utts"][k]) + intersec_add_dic[k] = v + + new_dic = dict() + for key_id in intersec_org_dic: + orgdic = intersec_org_dic[key_id] + adddic = intersec_add_dic[key_id] + + if "utt2spk" not in orgdic: + orgdic["utt2spk"] = "" + # NOTE: for machine translation + + # add as input + if args.is_input: + # original input + input_list = orgdic["input"] + # additional input + in_add_dic = {} + if "idim" in adddic and "ilen" in adddic: + in_add_dic["shape"] = [int(adddic["ilen"]), int(adddic["idim"])] + elif "idim" in adddic: + in_add_dic["shape"] = [int(adddic["idim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["idim", "ilen"]: + continue + in_add_dic[key] = value + # add name + in_add_dic["name"] = "input%d" % (len(input_list) + 1) + + input_list.append(in_add_dic) + new_dic[key_id] = { + "input": input_list, + "output": orgdic["output"], + "utt2spk": orgdic["utt2spk"], + } + # add as output + else: + # original output + output_list = orgdic["output"] + # additional output + out_add_dic = {} + # add shape + if "odim" in adddic and "olen" in adddic: + out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + elif "odim" in adddic: + out_add_dic["shape"] = [int(adddic["odim"])] + # add all other key value + for key, value in adddic.items(): + if key in ["odim", "olen"]: + continue + out_add_dic[key] = value + # add name + out_add_dic["name"] = "target%d" % (len(output_list) + 1) + + output_list.append(out_add_dic) + new_dic[key_id] = { + "input": orgdic["input"], + "output": output_list, + "utt2spk": 
orgdic["utt2spk"], + } + if "lang" in orgdic.keys(): + new_dic[key_id]["lang"] = orgdic["lang"] + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + {"utts": new_dic}, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": "), + ) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py new file mode 100755 index 000000000..8e8de3e08 --- /dev/null +++ b/utils/scp2json.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 +# encoding: utf-8 + +# Copyright 2017 Johns Hopkins University (Shinji Watanabe) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +from __future__ import print_function +from __future__ import unicode_literals + +import argparse +import codecs +import json +import sys + +is_python2 = sys.version_info[0] == 2 + + +def get_parser(): + parser = argparse.ArgumentParser( + description="convert scp to json", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument("--key", "-k", type=str, help="key") + return parser + + +if __name__ == "__main__": + parser = get_parser() + args = parser.parse_args() + + new_line = {} + sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")( + sys.stdout if is_python2 else sys.stdout.buffer + ) + line = sys.stdin.readline() + while line: + x = line.rstrip().split() + v = {args.key: " ".join(x[1:])} + new_line[x[0]] = v + line = sys.stdin.readline() + + all_l = {"utts": new_line} + + # ensure "ensure_ascii=False", which is a bug + jsonstring = json.dumps( + all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") + ) + print(jsonstring) diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl new file mode 100644 index 000000000..ae97d6582 --- /dev/null +++ b/utils/tokenizer.perl @@ -0,0 +1,596 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. 
+ +use warnings; + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); +### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use warnings; +use FindBin qw($RealBin); +use strict; +use Time::HiRes; + +if (eval {require Thread;1;}) { + #module loaded + Thread->import(); +} + +my $mydir = "$RealBin/../share/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my @protected_patterns = (); +my $protected_patterns_file = ""; +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; +my $PENN = 0; +my $NO_ESCAPING = 0; +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 1, next); + /^-time$/ && ($TIMING = 1, next); + # Option to add list of regexps to be protected + /^-protected/ && ($protected_patterns_file = shift, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); + /^-penn$/ && ($PENN = 1, next); + /^-no-escape/ && ($NO_ESCAPING = 1, next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + print " -penn ... use Penn treebank-like tokenization.\n"; + print " -protected FILE ... specify file with patters to be protected in tokenisation.\n"; + print " -no-escape ... 
don't perform HTML escaping on apostrophy, quotes, etc.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +# Load protected patterns +if ($protected_patterns_file) +{ + open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file"; + while() { + chomp; + push @protected_patterns, $_; + } +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while() + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while() + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array containing a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + + if ($PENN) { + return tokenize_penn($text); + } + + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # Find protected patterns + my @protected = (); + foreach my $protected_pattern (@protected_patterns) { + my $t = $text; + while ($t =~ /(?$protected_pattern)(?.*)$/) { + push @protected, $+{PATTERN}; + $t = $+{TAIL}; + } + } + + for (my $i = 0; $i < scalar(@protected); ++$i) { + my $subst = sprintf("THISISPROTECTED%.3d", $i); + $text =~ s,\Q$protected[$i], $subst ,g; + } + $text =~ s/ +/ /g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # separate out all "other" special characters + if (($language eq "fi") or ($language eq "sv")) { + # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character: + # USA:n, 20:een, EU:ssa, USA:s, S:t + $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g; + # if a colon is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g; + } + elsif ($language eq "tdt") { + # in Tetun, the apostrophe can be used inside words as an apostrophe-like character: + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + # if an apostrophe is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(\')(?=$|[^\p{Ll}])/ $1 /g; + } + elsif (($language eq "ca")) { + # in Catalan, the middle dot can be used inside words: + # il�lusio + $text =~ s/([^\p{IsAlnum}\s\.\·\'\`\,\-])/ $1 /g; + # if a middot is not immediately followed by lower-case characters, separate it out anyway + $text =~ s/(·)(?=$|[^\p{Ll}])/ $1 /g; + } + else { + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + } + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # seperate out "," except if within numbers (5,300) + #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + + # separate out "," except if within numbers (5,300) + # previous "global" application skips some: A,B,C,D,E > A , B,C , D,E + # first application uses up B so rule can't see B,C + # two-step version here may create extra spaces but these are removed later + # will also space digit,letter or letter,digit forms (redundant with next section) + $text =~ s/([^\p{IsN}])[,]/$1 , /g; + $text =~ s/[,]([^\p{IsN}])/ , $1/g; + + # separate "," after a number if it's the end of a sentence + $text =~ s/([\p{IsN}])[,]$/$1 ,/g; + + # separate , pre and post number + #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + #$text =~ 
s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn `into ' + #$text =~ s/\`/\'/g; + + #turn '' into " + #$text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga") or ($language eq "ca")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + elsif (($language eq "so") or ($language eq "tdt")) + { + # Don't split glottals + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if ($i == scalar(@words)-1) { + # split last words independently as they are unlikely to be non-breaking prefixes + $word = $pre." ."; + } + elsif (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + } + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub tokenize_penn +{ + # Improved compatibility with Penn Treebank tokenization. Useful if + # the text is to later be parsed with a PTB-trained parser. + # + # Adapted from Robert MacIntyre's sed script: + # http://www.cis.upenn.edu/~treebank/tokenizer.sed + + my($text) = @_; + chomp($text); + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # attempt to get correct directional quotes + $text =~ s/^``/`` /g; + $text =~ s/^"/`` /g; + $text =~ s/^`([^`])/` $1/g; + $text =~ s/^'/` /g; + $text =~ s/([ ([{<])"/$1 `` /g; + $text =~ s/([ ([{<])``/$1 `` /g; + $text =~ s/([ ([{<])`([^`])/$1 ` $2/g; + $text =~ s/([ ([{<])'/$1 ` /g; + # close quotes handled at end + + $text =~ s=\.\.\.= _ELLIPSIS_ =g; + + # separate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g; +$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g; + + # Separate out intra-token slashes. PTB tokenization doesn't do this, so + # the tokens should be merged prior to parsing with a PTB-trained parser + # (see syntax-hyphen-splitting.perl). + $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g; + + # Assume sentence tokenization has been done first, so split FINAL periods + # only. + $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g; + # however, we may as well split ALL question marks and exclamation points, + # since they shouldn't have the abbrev.-marker ambiguity problem + $text =~ s=([?!])= $1 =g; + + # parentheses, brackets, etc. 
+ $text =~ s=([\]\[\(\){}<>])= $1 =g; + $text =~ s/\(/-LRB-/g; + $text =~ s/\)/-RRB-/g; + $text =~ s/\[/-LSB-/g; + $text =~ s/\]/-RSB-/g; + $text =~ s/{/-LCB-/g; + $text =~ s/}/-RCB-/g; + + $text =~ s=--= -- =g; + + # First off, add a space to the beginning and end of each line, to reduce + # necessary number of regexps. + $text =~ s=$= =; + $text =~ s=^= =; + + $text =~ s="= '' =g; + # possessive or close-single-quote + $text =~ s=([^'])' =$1 ' =g; + # as in it's, I'm, we'd + $text =~ s='([sSmMdD]) = '$1 =g; + $text =~ s='ll = 'll =g; + $text =~ s='re = 're =g; + $text =~ s='ve = 've =g; + $text =~ s=n't = n't =g; + $text =~ s='LL = 'LL =g; + $text =~ s='RE = 'RE =g; + $text =~ s='VE = 'VE =g; + $text =~ s=N'T = N'T =g; + + $text =~ s= ([Cc])annot = $1an not =g; + $text =~ s= ([Dd])'ye = $1' ye =g; + $text =~ s= ([Gg])imme = $1im me =g; + $text =~ s= ([Gg])onna = $1on na =g; + $text =~ s= ([Gg])otta = $1ot ta =g; + $text =~ s= ([Ll])emme = $1em me =g; + $text =~ s= ([Mm])ore'n = $1ore 'n =g; + $text =~ s= '([Tt])is = '$1 is =g; + $text =~ s= '([Tt])was = '$1 was =g; + $text =~ s= ([Ww])anna = $1an na =g; + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + $text =~ s/\'/\'/g; # xml + $text =~ s/\"/\"/g; # xml + $text =~ s/\[/\[/g; # syntax non-terminal + $text =~ s/\]/\]/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while () + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} \ No newline at end of file diff --git a/utils/update_json.sh b/utils/update_json.sh new file mode 100755 index 000000000..bf6974755 --- /dev/null +++ b/utils/update_json.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +# Copyright 2020 Kyoto University (Hirofumi Inaguma) +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +echo "$0 $*" >&2 # Print the command line for logging +. ./path.sh + +nlsyms="" +oov="" +bpecode="" +verbose=0 + +text="" +multilingual=false + +help_message=$(cat << EOF +Usage: $0 +e.g. $0 data/train data/lang_1char/train_units.txt +Options: + --oov # Default: + --verbose # Default: 0 +EOF +) +. utils/parse_options.sh + +if [ $# != 3 ]; then + echo "${help_message}" 1>&2 + exit 1; +fi + +set -euo pipefail + +json=$1 +dir=$2 +dic=$3 +json_dir=$(dirname ${json}) +tmpdir=$(mktemp -d ${dir}/tmp-XXXXX) +trap 'rm -rf ${tmpdir}' EXIT + +if [ -z ${text} ]; then + text=${dir}/text +fi + +# 2. 
Create scp files for outputs +mkdir -p ${tmpdir}/output +if [ -n "${bpecode}" ]; then + if [ ${multilingual} = true ]; then + # remove a space before the language ID + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece | cut -f 2- -d" ") \ + > ${tmpdir}/output/token.scp + else + paste -d " " <(awk '{print $1}' ${text}) <(cut -f 2- -d" " ${text} \ + | spm_encode --model=${bpecode} --output_format=piece) \ + > ${tmpdir}/output/token.scp + fi +elif [ -n "${nlsyms}" ]; then + text2token.py -s 1 -n 1 -l ${nlsyms} ${text} > ${tmpdir}/output/token.scp +else + text2token.py -s 1 -n 1 ${text} > ${tmpdir}/output/token.scp +fi +< ${tmpdir}/output/token.scp utils/sym2int.pl --map-oov ${oov} -f 2- ${dic} > ${tmpdir}/output/tokenid.scp +awk '{print $1 " " NF-1}' ${tmpdir}/output/tokenid.scp > ${tmpdir}/output/olen.scp +# +2 comes from CTC blank and EOS +vocsize=$(tail -n 1 ${dic} | awk '{print $2}') +odim=$(echo "$vocsize + 2" | bc) +awk -v odim=${odim} '{print $1 " " odim}' ${text} > ${tmpdir}/output/odim.scp + +cat ${text} > ${tmpdir}/output/text.scp + + +# 4. Create JSON files from each scp files +rm -f ${tmpdir}/*/*.json +for x in "${tmpdir}"/output/*.scp; do + k=$(basename ${x} .scp) + < ${x} scp2json.py --key ${k} > ${tmpdir}/output/${k}.json +done + +# add to json +addjson.py --verbose ${verbose} -i false \ + ${json} ${tmpdir}/output/text.json ${tmpdir}/output/token.json ${tmpdir}/output/tokenid.json ${tmpdir}/output/olen.json ${tmpdir}/output/odim.json > ${tmpdir}/data.json +mkdir -p ${json_dir}/.backup +echo "json updated. original json is kept in ${json_dir}/.backup." +cp ${json} ${json_dir}/.backup/"$(basename ${json})" +cp ${tmpdir}/data.json ${json} + +rm -fr ${tmpdir} From 48207c14107a0de7d0c54d008220b1be832ba615 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 11:34:21 -0800 Subject: [PATCH 07/53] process scripts and configs --- examples/ted_en_zh/st1/conf/fbank.conf | 2 + examples/ted_en_zh/st1/conf/pitch.conf | 1 + examples/ted_en_zh/st1/local/ted_en_zh.py | 104 ++++++++++++++++++++++ examples/ted_en_zh/st1/steps | 1 + examples/ted_en_zh/st1/utils | 1 + 5 files changed, 109 insertions(+) create mode 100644 examples/ted_en_zh/st1/conf/fbank.conf create mode 100644 examples/ted_en_zh/st1/conf/pitch.conf create mode 100644 examples/ted_en_zh/st1/local/ted_en_zh.py create mode 120000 examples/ted_en_zh/st1/steps create mode 120000 examples/ted_en_zh/st1/utils diff --git a/examples/ted_en_zh/st1/conf/fbank.conf b/examples/ted_en_zh/st1/conf/fbank.conf new file mode 100644 index 000000000..82ac7bd0d --- /dev/null +++ b/examples/ted_en_zh/st1/conf/fbank.conf @@ -0,0 +1,2 @@ +--sample-frequency=16000 +--num-mel-bins=80 diff --git a/examples/ted_en_zh/st1/conf/pitch.conf b/examples/ted_en_zh/st1/conf/pitch.conf new file mode 100644 index 000000000..e959a19d5 --- /dev/null +++ b/examples/ted_en_zh/st1/conf/pitch.conf @@ -0,0 +1 @@ +--sample-frequency=16000 diff --git a/examples/ted_en_zh/st1/local/ted_en_zh.py b/examples/ted_en_zh/st1/local/ted_en_zh.py new file mode 100644 index 000000000..f30573b7e --- /dev/null +++ b/examples/ted_en_zh/st1/local/ted_en_zh.py @@ -0,0 +1,104 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import codecs +import os + + +# org_split = 'train-split/train-segment' +# text_file = 'En-Zh/train.en-zh' +# data_split = 'train' +def data_process(src_dir, tgt_dir, wav_dir_list, text_file_list, + data_split_list): + + for org_split, text_file, data_split in zip(wav_dir_list, text_file_list, + data_split_list): + local_data_split_dir = os.path.join(tgt_dir, data_split) + + os.makedirs(local_data_split_dir, exist_ok=True) + utts = [] + utt2spk = {} + with open(os.path.join(local_data_split_dir, 'wav.scp.org'), 'w') as wav_wf, \ + open(os.path.join(local_data_split_dir, 'utt2spk.org'), 'w') as utt2spk_wf: + for files in os.listdir(os.path.join(src_dir, org_split)): + files = files.strip() + file_path = os.path.join(src_dir, org_split, files) + size = os.path.getsize(file_path) + if size <= 30000: + continue + utt = files.split('.')[0] + audio_name = utt.split('_')[0] + #format the name of utterance + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + utts.append(utt) + spk = utt.split('_')[0] + utt2spk[utt] = spk + assert len(spk) == 16, "%r" % spk + print(utt, 'cat', os.path.abspath(file_path), '|', file=wav_wf) + for utt in sorted(utts): + print(utt, utt2spk[utt], file=utt2spk_wf) + + with open(os.path.join(local_data_split_dir, 'en.org'), 'w') as en_wf, \ + open(os.path.join(local_data_split_dir, 'zh.org'), 'w') as zh_wf, \ + open(os.path.join(local_data_split_dir, '.yaml'), 'w') as yaml_wf, \ + codecs.open(os.path.join(src_dir, text_file), 'r', encoding='utf-8', + errors='ignore') as rf: + count = 0 + for line in rf: + line = line.strip() + line_spl = line.split('\t') + assert len(line_spl) == 3, "%r" % line + wav, en, zh = line_spl + assert wav.endswith('wav'), "%r" % wav[-3:] + utt = wav.split('.')[0] + audio_name = utt.split('_')[0] + while len(audio_name) < 6: + utt = '0' + utt + audio_name = '0' + audio_name + utt = 'ted-en-zh-' + utt + print(utt, file=yaml_wf) + print(en.lower(), file=en_wf) + print(zh, file=zh_wf) + count += 1 + print('%s set lines count: %d' % (data_split, count)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description=__doc__) + + parser.add_argument( + "--src-dir", + default="", + type=str, + help="Directory to kaldi splited data. (default: %(default)s)") + parser.add_argument( + "--tgt-dir", + default="local/ted_en_zh", + type=str, + help="Directory to save processed data. 
(default: %(default)s)") + args = parser.parse_args() + + wav_dir_list = [ + 'train-split/train-segment', 'test-segment/tst2014', + 'test-segment/tst2015' + ] + text_file_list = [ + 'En-Zh/train.en-zh', 'En-Zh/tst2014.en-zh', 'En-Zh/tst2015.en-zh' + ] + data_split_list = ['train', 'dev', 'test'] + data_process(args.src_dir, args.tgt_dir, wav_dir_list, text_file_list, + data_split_list) diff --git a/examples/ted_en_zh/st1/steps b/examples/ted_en_zh/st1/steps new file mode 120000 index 000000000..91f2d234e --- /dev/null +++ b/examples/ted_en_zh/st1/steps @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/steps \ No newline at end of file diff --git a/examples/ted_en_zh/st1/utils b/examples/ted_en_zh/st1/utils new file mode 120000 index 000000000..f49247da8 --- /dev/null +++ b/examples/ted_en_zh/st1/utils @@ -0,0 +1 @@ +../../../tools/kaldi/egs/wsj/s5/utils \ No newline at end of file From e867f3bb416a0c7b8349995ad2ff3f2c97fc6b4e Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 14:02:05 -0800 Subject: [PATCH 08/53] minor --- examples/ted_en_zh/st1/local/data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index 8b829a8a1..c61c9a9fc 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -2,7 +2,7 @@ set -e -stage=3 +stage=-1 stop_stage=100 dict_dir=data/lang_char From d2fab3238b7082ee5a5df33d6725514cf4cceb05 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:57:36 -0800 Subject: [PATCH 09/53] fix bugs --- paddlespeech/s2t/frontend/utility.py | 8 ++++---- paddlespeech/s2t/io/sampler.py | 2 +- paddlespeech/s2t/utils/checkpoint.py | 3 +++ 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/frontend/utility.py b/paddlespeech/s2t/frontend/utility.py index 703f2127d..d423a6044 100644 --- a/paddlespeech/s2t/frontend/utility.py +++ b/paddlespeech/s2t/frontend/utility.py @@ -102,10 +102,10 @@ def read_manifest( manifest = [] with jsonlines.open(manifest_path, 'r') as reader: for json_data in reader: - feat_len = json_data["feat_shape"][ - 0] if 'feat_shape' in json_data else 1.0 - token_len = json_data["token_shape"][ - 0] if 'token_shape' in json_data else 1.0 + feat_len = json_data["input"][0]["shape"][ + 0] if 'shape' in json_data["input"][0] else 1.0 + token_len = json_data["output"][0]["shape"][ + 0] if 'shape' in json_data["output"][0] else 1.0 conditions = [ feat_len >= min_input_len, feat_len <= max_input_len, diff --git a/paddlespeech/s2t/io/sampler.py b/paddlespeech/s2t/io/sampler.py index 35b57524b..0d5a16ce1 100644 --- a/paddlespeech/s2t/io/sampler.py +++ b/paddlespeech/s2t/io/sampler.py @@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False): """ rng = np.random.RandomState(epoch) shift_len = rng.randint(0, batch_size - 1) - batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size)) + batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size)) rng.shuffle(batch_indices) batch_indices = [item for batch in batch_indices for item in batch] assert clipped is False diff --git a/paddlespeech/s2t/utils/checkpoint.py b/paddlespeech/s2t/utils/checkpoint.py index 5105f95ef..4c493715a 100644 --- a/paddlespeech/s2t/utils/checkpoint.py +++ b/paddlespeech/s2t/utils/checkpoint.py @@ -94,6 +94,9 @@ class Checkpoint(): """ configs = {} + if len(checkpoint_path) == 0 or checkpoint_path == "None": + checkpoint_path = None + if checkpoint_path is not None: pass elif checkpoint_dir is 
not None and record_file is not None: From 3c8e87344a4ce38617adabd44f3496157e9e80e8 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 16:59:50 -0800 Subject: [PATCH 10/53] update run scripts --- .../st1/conf/transformer_mtl_noam.yaml | 4 +- examples/ted_en_zh/st1/local/data.sh | 2 +- .../ted_en_zh/st1/local/train_finetune.sh | 39 ------------------- examples/ted_en_zh/st1/run.sh | 17 ++++---- 4 files changed, 11 insertions(+), 51 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/train_finetune.sh diff --git a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml index b4fb51075..3175aad9f 100644 --- a/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml +++ b/examples/ted_en_zh/st1/conf/transformer_mtl_noam.yaml @@ -11,9 +11,9 @@ data: max_output_input_ratio: 20.0 collator: - vocab_filepath: data/lang_char/vocab.txt + vocab_filepath: data/lang_char/ted_en_zh_bpe8000.txt unit_type: 'spm' - spm_model_prefix: data/train_sp.en-zh-nlpr.zh-nlpr_bpe8000_tc + spm_model_prefix: data/lang_char/ted_en_zh_bpe8000 mean_std_filepath: "" # augmentation_config: conf/augmentation.json batch_size: 10 diff --git a/examples/ted_en_zh/st1/local/data.sh b/examples/ted_en_zh/st1/local/data.sh index c61c9a9fc..f9c876b16 100755 --- a/examples/ted_en_zh/st1/local/data.sh +++ b/examples/ted_en_zh/st1/local/data.sh @@ -166,7 +166,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ${prep_dir}/test.en-zh.zh/feats.scp ${prep_dir}/train_sp.en-zh.zh/cmvn.ark ${prep_dir}/dump_feats/test.en-zh.zh ${feat_trans_dir} fi -dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}_joint.txt +dict=${dict_dir}/ted_en_zh_${bpemode}${nbpe}.txt nlsyms=${dict_dir}/ted_en_zh_non_lang_syms.txt bpemodel=${dict_dir}/ted_en_zh_${bpemode}${nbpe} if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then diff --git a/examples/ted_en_zh/st1/local/train_finetune.sh b/examples/ted_en_zh/st1/local/train_finetune.sh deleted file mode 100755 index e54c7fff4..000000000 --- a/examples/ted_en_zh/st1/local/train_finetune.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -if [ $# != 3 ];then - echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" - exit -1 -fi - -ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') -echo "using $ngpu gpus..." - -config_path=$1 -ckpt_name=$2 -ckpt_path=$3 - -mkdir -p exp - -# seed may break model convergence -seed=0 -if [ ${seed} != 0 ]; then - export FLAGS_cudnn_deterministic=True -fi - -python3 -u ${BIN_DIR}/train.py \ ---ngpu ${ngpu} \ ---config ${config_path} \ ---output exp/${ckpt_name} \ ---checkpoint_path ${ckpt_path} \ ---seed ${seed} - -if [ ${seed} != 0 ]; then - unset FLAGS_cudnn_deterministic -fi - -if [ $? -ne 0 ]; then - echo "Failed in training!" 
- exit 1 -fi - -exit 0 \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index f8adf4f65..a1c99af30 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -6,7 +6,7 @@ gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt_path=paddle.98 +ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; @@ -22,21 +22,20 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # download pretrained - bash ./local/download_pretrain.sh || exit -1 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # train model, all `ckpt` under `exp` dir - CUDA_VISIBLE_DEVICES=${gpus} ./local/train_finetune.sh ${conf_path} ${ckpt} ${ckpt_path} + if [ -n "${ckpt_path}" ]; then + echo "Finetune from Pretrained Model" ${ckpt_path} + ./local/download_pretrain.sh || exit -1 + fi + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" fi -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # avg n best model avg.sh best exp/${ckpt}/checkpoints ${avg_num} fi -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # test ckpt avg_n CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1 fi \ No newline at end of file From 0cc81a52cdae3783972c4fa25d8de33784fb7f97 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:00:09 -0800 Subject: [PATCH 11/53] update format --- utils/addjson.py | 27 ++++++++++++--------------- utils/scp2json.py | 19 ++++++++++--------- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/utils/addjson.py b/utils/addjson.py index 7fabe625e..013d14727 100755 --- a/utils/addjson.py +++ b/utils/addjson.py @@ -1,9 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2018 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - from __future__ import print_function from __future__ import unicode_literals @@ -12,7 +10,6 @@ import codecs import json import logging import sys - from distutils.util import strtobool from espnet.utils.cli_utils import get_commandline_args @@ -23,17 +20,16 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="add multiple json values to an input or output value", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("jsons", type=str, nargs="+", help="json files") parser.add_argument( "-i", "--is-input", default=True, type=strtobool, - help="If true, add to input. If false, add to output", - ) - parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") + help="If true, add to input. 
If false, add to output", ) + parser.add_argument( + "--verbose", "-V", default=0, type=int, help="Verbose option") return parser @@ -121,7 +117,8 @@ if __name__ == "__main__": out_add_dic = {} # add shape if "odim" in adddic and "olen" in adddic: - out_add_dic["shape"] = [int(adddic["olen"]), int(adddic["odim"])] + out_add_dic[ + "shape"] = [int(adddic["olen"]), int(adddic["odim"])] elif "odim" in adddic: out_add_dic["shape"] = [int(adddic["odim"])] # add all other key value @@ -143,13 +140,13 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - {"utts": new_dic}, + { + "utts": new_dic + }, indent=4, ensure_ascii=False, sort_keys=True, - separators=(",", ": "), - ) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + separators=(",", ": "), ) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) print(jsonstring) diff --git a/utils/scp2json.py b/utils/scp2json.py index 8e8de3e08..e2a757665 100755 --- a/utils/scp2json.py +++ b/utils/scp2json.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 # encoding: utf-8 - # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) from __future__ import print_function @@ -17,8 +16,7 @@ is_python2 = sys.version_info[0] == 2 def get_parser(): parser = argparse.ArgumentParser( description="convert scp to json", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) + formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument("--key", "-k", type=str, help="key") return parser @@ -28,10 +26,10 @@ if __name__ == "__main__": args = parser.parse_args() new_line = {} - sys.stdin = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer) - sys.stdout = codecs.getwriter("utf-8")( - sys.stdout if is_python2 else sys.stdout.buffer - ) + sys.stdin = codecs.getreader("utf-8")(sys.stdin + if is_python2 else sys.stdin.buffer) + sys.stdout = codecs.getwriter("utf-8")(sys.stdout + if is_python2 else sys.stdout.buffer) line = sys.stdin.readline() while line: x = line.rstrip().split() @@ -43,6 +41,9 @@ if __name__ == "__main__": # ensure "ensure_ascii=False", which is a bug jsonstring = json.dumps( - all_l, indent=4, ensure_ascii=False, sort_keys=True, separators=(",", ": ") - ) + all_l, + indent=4, + ensure_ascii=False, + sort_keys=True, + separators=(",", ": ")) print(jsonstring) From 351e4e8e87e1b5b678c4aded167cb735327da4ee Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 17:01:16 -0800 Subject: [PATCH 12/53] training script --- examples/ted_en_zh/st1/local/train.sh | 39 +++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 examples/ted_en_zh/st1/local/train.sh diff --git a/examples/ted_en_zh/st1/local/train.sh b/examples/ted_en_zh/st1/local/train.sh new file mode 100755 index 000000000..a8e4acaa0 --- /dev/null +++ b/examples/ted_en_zh/st1/local/train.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +if [ $# != 3 ];then + echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path" + exit -1 +fi + +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." 
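+# NOTE: ckpt_path ($3) may be an empty string when training from scratch;
+# run.sh passes it quoted ("${ckpt_path}") so the 3-argument check above still passes.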
+ +config_path=$1 +ckpt_name=$2 +ckpt_path=$3 + +mkdir -p exp + +# seed may break model convergence +seed=0 +if [ ${seed} != 0 ]; then + export FLAGS_cudnn_deterministic=True +fi + +python3 -u ${BIN_DIR}/train.py \ +--ngpu ${ngpu} \ +--config ${config_path} \ +--output exp/${ckpt_name} \ +--checkpoint_path "${ckpt_path}" \ +--seed ${seed} + +if [ ${seed} != 0 ]; then + unset FLAGS_cudnn_deterministic +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 \ No newline at end of file From 79060e20e3d5f6285f49b503f95b8db9ddce9294 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:34:43 +0800 Subject: [PATCH 13/53] Update pack_model.sh --- utils/pack_model.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/pack_model.sh b/utils/pack_model.sh index 8acd59a64..d7df01eb7 100755 --- a/utils/pack_model.sh +++ b/utils/pack_model.sh @@ -57,7 +57,7 @@ else echo "missing ${dec_conf}" exit 1 fi -# NOTE(kan-bayashi): preprocess conf is optional +# preprocess conf is optional if [ -n "${preprocess_conf}" ]; then tar rfh ${outfile}.tar ${preprocess_conf} echo -n " - preprocess config file: \`" From 507c3b52eab46beb411314e90e2928042abd1065 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:16 +0800 Subject: [PATCH 14/53] Update default.yaml --- examples/csmsc/voc3/conf/default.yaml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/csmsc/voc3/conf/default.yaml b/examples/csmsc/voc3/conf/default.yaml index cc27220fc..5dda835ae 100644 --- a/examples/csmsc/voc3/conf/default.yaml +++ b/examples/csmsc/voc3/conf/default.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). ########################################################### # FEATURE EXTRACTION SETTING # @@ -136,4 +135,4 @@ eval_interval_steps: 1000 # Interval steps to evaluate the network # OTHER SETTING # ########################################################### num_snapshots: 10 # max number of snapshots to keep while training -seed: 42 # random seed for paddle, random, and np.random \ No newline at end of file +seed: 42 # random seed for paddle, random, and np.random From 2de7bc14b085f9b835a4acdc350475405a310ecf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 30 Nov 2021 10:35:29 +0800 Subject: [PATCH 15/53] Update finetune.yaml --- examples/csmsc/voc3/conf/finetune.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/csmsc/voc3/conf/finetune.yaml b/examples/csmsc/voc3/conf/finetune.yaml index 80ab6bed7..302274019 100644 --- a/examples/csmsc/voc3/conf/finetune.yaml +++ b/examples/csmsc/voc3/conf/finetune.yaml @@ -6,8 +6,7 @@ # This configuration is based on full-band MelGAN but the hop size and sampling # rate is different from the paper (16kHz vs 24kHz). The number of iteraions # is not shown in the paper so currently we train 1M iterations (not sure enough -# to converge). The optimizer setting is based on @dathudeptrai advice. -# https://github.com/kan-bayashi/ParallelWaveGAN/issues/143#issuecomment-632539906 +# to converge). 
########################################################### # FEATURE EXTRACTION SETTING # From c94ebdc52cdcf52b9e400fe2090efc953f895b4e Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Tue, 30 Nov 2021 14:22:32 +0800 Subject: [PATCH 16/53] Add python api for executor. --- paddlespeech/cli/executor.py | 15 +++++++++++++++ paddlespeech/cli/s2t/infer.py | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index 2261e011b..2314bd6d3 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -14,6 +14,7 @@ import os from abc import ABC from abc import abstractmethod +from typing import List from typing import Union import paddle @@ -64,3 +65,17 @@ class BaseExecutor(ABC): Output postprocess and return human-readable results such as texts and audio files. """ pass + + @abstractmethod + def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ + pass + + @abstractmethod + def __call__(self, *arg, **kwargs): + """ + Python API to call an executor. + """ + pass diff --git a/paddlespeech/cli/s2t/infer.py b/paddlespeech/cli/s2t/infer.py index 6aa29addf..9509e311c 100644 --- a/paddlespeech/cli/s2t/infer.py +++ b/paddlespeech/cli/s2t/infer.py @@ -126,6 +126,9 @@ class S2TExecutor(BaseExecutor): pass def execute(self, argv: List[str]) -> bool: + """ + Command line entry. + """ parser_args = self.parser.parse_args(argv) print(parser_args) @@ -137,12 +140,20 @@ class S2TExecutor(BaseExecutor): device = parser_args.device try: - self._init_from_path(model, lang, config, ckpt_path) - self.preprocess(audio_file) - self.infer() - res = self.postprocess() # Retrieve result of s2t. + res = self(model, lang, config, ckpt_path, audio_file, device) print(res) return True except Exception as e: print(e) return False + + def __call__(self, model, lang, config, ckpt_path, audio_file, device): + """ + Python API to call an executor. + """ + self._init_from_path(model, lang, config, ckpt_path) + self.preprocess(audio_file) + self.infer() + res = self.postprocess() # Retrieve result of s2t. 
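+        # Example call through this Python API (illustrative values):
+        #   text = S2TExecutor()('wenetspeech', 'zh', None, None, 'input_16k.wav', 'cpu')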
+ + return res From f225b1d88ecdf92b758e999e28ff4e6d433d95f6 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:39:07 -0800 Subject: [PATCH 17/53] minor updates --- examples/ted_en_zh/st1/local/data_prep.sh | 54 ----------------------- examples/ted_en_zh/st1/path.sh | 3 +- examples/ted_en_zh/st1/run.sh | 5 ++- 3 files changed, 4 insertions(+), 58 deletions(-) delete mode 100755 examples/ted_en_zh/st1/local/data_prep.sh diff --git a/examples/ted_en_zh/st1/local/data_prep.sh b/examples/ted_en_zh/st1/local/data_prep.sh deleted file mode 100755 index 339cee1eb..000000000 --- a/examples/ted_en_zh/st1/local/data_prep.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Kyoto University (Hirofumi Inaguma) -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -export LC_ALL=C - -data_dir=${1} - -for set in train dev test; do -# for set in train; do - dst=${target_dir}/${set} - for lang in en zh; do - - if [ ${lang} = 'en' ]; then - echo "remove punctuation $lang" - # remove punctuation - local/remove_punctuation.pl < ${dst}/${lang}.org > ${dst}/${lang}.raw - else - cp ${dst}/${lang}.org ${dst}/${lang}.raw - fi - - paste -d " " ${dst}/.yaml ${dst}/${lang}.raw | sort > ${dst}/text.${lang} - - - done - # error check - n=$(cat ${dst}/.yaml | wc -l) - n_en=$(cat ${dst}/en.raw | wc -l) - n_tgt=$(cat ${dst}/zh.raw | wc -l) - [ ${n} -ne ${n_en} ] && echo "Warning: expected ${n} data data files, found ${n_en}" && exit 1; - [ ${n} -ne ${n_tgt} ] && echo "Warning: expected ${n} data data files, found ${n_tgt}" && exit 1; - - echo "done text processing" - cat ${dst}/wav.scp.org | uniq | sort -k1,1 -u > ${dst}/wav.scp - cat ${dst}/utt2spk.org | uniq | sort -k1,1 -u > ${dst}/utt2spk - - cat ${dst}/utt2spk | utt2spk_to_spk2utt.pl | sort -k1,1 -u > ${dst}/spk2utt - rm -rf ${target_dir}/data_prep/${set}.en-zh - mkdir -p ${target_dir}/data_prep/${set}.en-zh - echo "remove duplicate lines..." - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep -v '1 ted-en-zh' \ - | sed 's/^[ \t]*//' > ${dst}/duplicate_lines - cut -d ' ' -f 1 ${dst}/text.en | sort | uniq -c | sort -n -k1 -r | grep '1 ted-en-zh' \ - | cut -d '1' -f 2- | sed 's/^[ \t]*//' > ${dst}/reclist - reduce_data_dir.sh ${dst} ${dst}/reclist ${target_dir}/data_prep/${set}.en-zh - echo "done wav processing" - for l in en zh; do - cp ${dst}/text.${l} ${target_dir}/data_prep/${set}.en-zh/text.${l} - done - fix_data_dir.sh --utt_extra_files \ - "text.en text.zh" \ - ${target_dir}/data_prep/${set}.en-zh -done \ No newline at end of file diff --git a/examples/ted_en_zh/st1/path.sh b/examples/ted_en_zh/st1/path.sh index ee4c9779f..867cdb48a 100644 --- a/examples/ted_en_zh/st1/path.sh +++ b/examples/ted_en_zh/st1/path.sh @@ -19,5 +19,4 @@ export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi [ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" -[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh -export train_cmd="run.pl" \ No newline at end of file +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . 
$KALDI_ROOT/tools/config/common_path.sh \ No newline at end of file diff --git a/examples/ted_en_zh/st1/run.sh b/examples/ted_en_zh/st1/run.sh index a1c99af30..f6362a8b3 100755 --- a/examples/ted_en_zh/st1/run.sh +++ b/examples/ted_en_zh/st1/run.sh @@ -1,12 +1,13 @@ #!/bin/bash set -e -source path.sh +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; gpus=0,1,2,3 stage=1 stop_stage=4 conf_path=conf/transformer_mtl_noam.yaml -ckpt= # paddle.98 # (finetune from FAT-ST pretrained model) +ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) avg_num=5 data_path=./TED_EnZh # path to unzipped data source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; From aea1e92a3df7bad912f70ad84d953f02a43b8471 Mon Sep 17 00:00:00 2001 From: Junkun Date: Mon, 29 Nov 2021 22:50:34 -0800 Subject: [PATCH 18/53] update cmd.sh --- examples/ted_en_zh/st1/cmd.sh | 89 +++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 examples/ted_en_zh/st1/cmd.sh diff --git a/examples/ted_en_zh/st1/cmd.sh b/examples/ted_en_zh/st1/cmd.sh new file mode 100644 index 000000000..7b70ef5e0 --- /dev/null +++ b/examples/ted_en_zh/st1/cmd.sh @@ -0,0 +1,89 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: .pl [options] JOB=1: +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time