From 02c7ef319898e33650aad90759c98712d3777cad Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 22 Nov 2021 10:30:54 +0000
Subject: [PATCH] format_data: support multiple target outputs

---
 examples/ted_en_zh/t0/run.sh |  3 ++-
 utils/format_data.py         | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/examples/ted_en_zh/t0/run.sh b/examples/ted_en_zh/t0/run.sh
index e9f4a058..654d4dce 100755
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/t0/run.sh
@@ -2,6 +2,7 @@
 set -e
 source path.sh
 
+gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/transformer_joint_noam.yaml
@@ -21,7 +22,7 @@ fi
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path}  ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
 fi
 
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
diff --git a/utils/format_data.py b/utils/format_data.py
index f9b5e6aa..2fa1924a 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -87,15 +87,24 @@ def main():
                 tokens = text_feature.tokenize(line)
                 tokenids = text_feature.featurize(line)
                 output_json['output'].append({
-                    'name': 'traget1',
+                    'name': 'target1',
                     'shape': (len(tokenids), vocab_size),
                     'text': line,
                     'token': ' '.join(tokens),
                     'tokenid': ' '.join(map(str, tokenids)),
                 })
             else:
-                # isinstance(line, list), multi target 
-                raise NotImplementedError("not support multi output now!")
+                # isinstance(line, list): multiple targets, all sharing one vocab
+                for i, item in enumerate(line, 1):
+                    tokens = text_feature.tokenize(item)
+                    tokenids = text_feature.featurize(item)
+                    output_json['output'].append({
+                        'name': f'target{i}',
+                        'shape': (len(tokenids), vocab_size),
+                        'text': item,
+                        'token': ' '.join(tokens),
+                        'tokenid': ' '.join(map(str, tokenids)),
+                    })
 
             # input
             line = line_json['feat']