From b944418d6ffb0fe492185cca2577e9d00d946ce7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 22 Nov 2021 11:11:27 +0000 Subject: [PATCH] new format data support ds2/st --- examples/dataset/ted_en_zh/ted_en_zh.py | 7 +++++-- examples/ted_en_zh/t0/local/data.sh | 6 +++--- paddlespeech/s2t/io/collator.py | 11 ++++++----- paddlespeech/s2t/io/dataset.py | 2 +- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/examples/dataset/ted_en_zh/ted_en_zh.py b/examples/dataset/ted_en_zh/ted_en_zh.py index 14bef01d2..a8cbb8379 100644 --- a/examples/dataset/ted_en_zh/ted_en_zh.py +++ b/examples/dataset/ted_en_zh/ted_en_zh.py @@ -72,14 +72,17 @@ def create_manifest(data_dir, manifest_path_prefix): continue audio_data, samplerate = soundfile.read(audio_path) duration = float(len(audio_data) / samplerate) + + + translation_str = " ".join(translation.split()) + trancription_str = " ".join(trancription.split()) json_lines.append( json.dumps( { 'utt': utt, 'feat': audio_path, 'feat_shape': (duration, ), # second - 'text': " ".join(translation.split()), - 'text1': " ".join(trancription.split()) + 'text': [translation_str, trancription_str], }, ensure_ascii=False)) diff --git a/examples/ted_en_zh/t0/local/data.sh b/examples/ted_en_zh/t0/local/data.sh index ce58f539d..d3acbd448 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -9,7 +9,7 @@ stop_stage=100 nbpe=8000 bpemode=unigram bpeprefix="data/bpe_${bpemode}_${nbpe}" -data_dir=./TED_EnZh +data_dir=./TED-En-Zh source ${MAIN_ROOT}/utils/parse_options.sh @@ -21,7 +21,7 @@ mkdir -p data if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ! -e ${data_dir} ]; then - echo "Error: Dataset is not avaiable. Please download and unzip the dataset" + echo "Error: ${data_dir} Dataset is not avaiable. Please download and unzip the dataset" echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0" echo "The tree of the directory should be:" echo "." @@ -88,7 +88,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then # format manifest with tokenids, vocab size for set in train dev test; do { - python3 ${MAIN_ROOT}/utils/format_triplet_data.py \ + python3 ${MAIN_ROOT}/utils/format_data.py \ --cmvn_path "data/mean_std.json" \ --unit_type "spm" \ --spm_model_prefix ${bpeprefix} \ diff --git a/paddlespeech/s2t/io/collator.py b/paddlespeech/s2t/io/collator.py index a500f10c9..35b868718 100644 --- a/paddlespeech/s2t/io/collator.py +++ b/paddlespeech/s2t/io/collator.py @@ -237,8 +237,8 @@ class SpeechCollatorBase(): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - text = item['text'] + audio = item['input'][0]['feat'] + text = item['output'][0]['text'] audio, text = self.process_utterance(audio, text) audios.append(audio) # [T, D] @@ -381,9 +381,10 @@ class TripletSpeechCollator(SpeechCollator): for idx, item in enumerate(batch): utts.append(item['utt']) - audio = item['feat'] - translation = item['text'] - transcription = item['text1'] + audio = item['input'][0]['feat'] + translation = item['output'][0]['text'] + transcription = item['output'][1]['text'] + audio, translation, transcription = self.process_utterance( audio, translation, transcription) diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 7007518da..c5df2d6bd 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -122,7 +122,7 @@ class ManifestDataset(Dataset): min_output_len=min_output_len, max_output_input_ratio=max_output_input_ratio, min_output_input_ratio=min_output_input_ratio) - self._manifest.sort(key=lambda x: x["feat_shape"][0]) + self._manifest.sort(key=lambda x: x["input"][0]["shape"][0]) def __len__(self): return len(self._manifest)