From b944418d6ffb0fe492185cca2577e9d00d946ce7 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Mon, 22 Nov 2021 11:11:27 +0000
Subject: [PATCH] support new data format for ds2/st

---
 examples/dataset/ted_en_zh/ted_en_zh.py |  7 +++++--
 examples/ted_en_zh/t0/local/data.sh     |  6 +++---
 paddlespeech/s2t/io/collator.py         | 11 ++++++-----
 paddlespeech/s2t/io/dataset.py          |  2 +-
 4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py
index 14bef01d2..a8cbb8379 100644
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
@@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix):
                     continue
                 audio_data, samplerate = soundfile.read(audio_path)
                 duration = float(len(audio_data) / samplerate)
+
+
+                translation_str = " ".join(translation.split())
+                trancription_str = " ".join(trancription.split())
                 json_lines.append(
                     json.dumps(
                         {
                             'utt': utt,
                             'feat': audio_path,
                             'feat_shape': (duration, ),  # second
-                            'text': " ".join(translation.split()),
-                            'text1': " ".join(trancription.split())
+                            'text': [translation_str, trancription_str],  
                         },
                         ensure_ascii=False))
 
diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh
index ce58f539d..d3acbd448 100755
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
@@ -9,7 +9,7 @@ stop_stage=100
 nbpe=8000
 bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
-data_dir=./TED_EnZh
+data_dir=./TED-En-Zh
 
 
 source ${MAIN_ROOT}/utils/parse_options.sh
@@ -21,7 +21,7 @@ mkdir -p data
 
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
     if [ ! -e ${data_dir} ]; then
-        echo "Error: Dataset is not avaiable. Please download and unzip the dataset"
+        echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset"
         echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
         echo "The tree of the directory should be:"
         echo "."
@@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     # format manifest with tokenids, vocab size
     for set in train dev test; do
     {
-        python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
+        python3 ${MAIN_ROOT}/utils/format_data.py \
         --cmvn_path "data/mean_std.json" \
         --unit_type "spm" \
         --spm_model_prefix ${bpeprefix} \
diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py
index a500f10c9..35b868718 100644
--- a/paddlespeech/s2t/io/collator.py
+++ b/paddlespeech/s2t/io/collator.py
@@ -237,8 +237,8 @@ class SpeechCollatorBase():
         for idx, item in enumerate(batch):
             utts.append(item['utt'])
 
-            audio = item['feat']
-            text = item['text']
+            audio = item['input'][0]['feat']
+            text = item['output'][0]['text']
             audio, text = self.process_utterance(audio, text)
 
             audios.append(audio)  # [T, D]
@@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator):
         for idx, item in enumerate(batch):
             utts.append(item['utt'])
 
-            audio = item['feat']
-            translation = item['text']
-            transcription = item['text1']
+            audio = item['input'][0]['feat']
+            translation = item['output'][0]['text']
+            transcription = item['output'][1]['text']
+
             audio, translation, transcription = self.process_utterance(
                 audio, translation, transcription)
 
diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py
index 7007518da..c5df2d6bd 100644
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
@@ -122,7 +122,7 @@ class ManifestDataset(Dataset):
             min_output_len=min_output_len,
             max_output_input_ratio=max_output_input_ratio,
             min_output_input_ratio=min_output_input_ratio)
-        self._manifest.sort(key=lambda x: x["feat_shape"][0])
+        self._manifest.sort(key=lambda x: x["input"][0]["shape"][0])
 
     def __len__(self):
         return len(self._manifest)