From 02c7ef319898e33650aad90759c98712d3777cad Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 22 Nov 2021 10:30:54 +0000
Subject: [PATCH] format data support multi output

---
Note (review): the line breaks and leading indentation of this patch were
restored from a whitespace-collapsed copy. Hunk line counts match the @@
headers and the diffstat; the exact indentation of context lines is presumed.
Verify against the target files, or apply with `git apply --ignore-whitespace`.

 examples/ted_en_zh/t0/run.sh |  3 ++-
 utils/format_data.py         | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh
index e9f4a058..654d4dce 100755
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/t0/run.sh
@@ -2,6 +2,7 @@
 set -e
 source path.sh

+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/transformer_joint_noam.yaml
@@ -21,7 +22,7 @@ fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/utils/format_data.py b/utils/format_data.py
index f9b5e6aa..2fa1924a 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -87,15 +87,24 @@ def main():
                     tokens = text_feature.tokenize(line)
                     tokenids = text_feature.featurize(line)
                     output_json['output'].append({
-                        'name': 'traget1',
+                        'name': 'target1',
                         'shape': (len(tokenids), vocab_size),
                         'text': line,
                         'token': ' '.join(tokens),
                         'tokenid': ' '.join(map(str, tokenids)),
                     })
                 else:
-                    # isinstance(line, list), multi target
-                    raise NotImplementedError("not support multi output now!")
+                    # isinstance(line, list), multi target in one vocab
+                    for i, item in enumerate(line, 1):
+                        tokens = text_feature.tokenize(item)
+                        tokenids = text_feature.featurize(item)
+                        output_json['output'].append({
+                            'name': f'target{i}',
+                            'shape': (len(tokenids), vocab_size),
+                            'text': item,
+                            'token': ' '.join(tokens),
+                            'tokenid': ' '.join(map(str, tokenids)),
+                        })

                 # input
                 line = line_json['feat']