From 2fe92b4b34bbddecca2954b9f1e2093e3631ffae Mon Sep 17 00:00:00 2001
From: chenfeiyu <chenfeiyu@baidu.com>
Date: Wed, 19 May 2021 15:06:06 +0800
Subject: [PATCH] change output format

---
 examples/chinese_g2p/README.md                |  2 +
 ...act_pinyin.py => convert_transcription.py} |  3 +-
 .../chinese_g2p/local/extract_pinyin_label.py | 37 +++++++++++++++++++
 examples/chinese_g2p/local/prepare_dataset.sh |  3 +-
 examples/chinese_g2p/run.sh                   |  6 ++-
 5 files changed, 46 insertions(+), 5 deletions(-)
 rename examples/chinese_g2p/local/{extract_pinyin.py => convert_transcription.py} (95%)
 create mode 100644 examples/chinese_g2p/local/extract_pinyin_label.py

diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md
index 8855d37a9..e3fdfe684 100644
--- a/examples/chinese_g2p/README.md
+++ b/examples/chinese_g2p/README.md
@@ -1,3 +1,5 @@
 # Download Baker dataset
 
 Baker dataset has to be downloaded mannually and moved to 'data/', because you will have to pass the CATTCHA from a browswe to download the dataset.
+
+Download URL: <https://test.data-baker.com/#/data/index/source>
diff --git a/examples/chinese_g2p/local/extract_pinyin.py b/examples/chinese_g2p/local/convert_transcription.py
similarity index 95%
rename from examples/chinese_g2p/local/extract_pinyin.py
rename to examples/chinese_g2p/local/convert_transcription.py
index 5f2c663bd..b133ad2c5 100644
--- a/examples/chinese_g2p/local/extract_pinyin.py
+++ b/examples/chinese_g2p/local/convert_transcription.py
@@ -24,7 +24,6 @@ def extract_pinyin(source, target, use_jieba=False):
         with open(target, 'wt', encoding='utf-8') as fout:
             for i, line in enumerate(fin):
                 if i % 2 == 0:
-                    fout.write(line)
                     sentence_id, raw_text = line.strip().split()
                     raw_text = re.sub(r'#\d', '', raw_text)
                     if use_jieba:
@@ -35,7 +34,7 @@ def extract_pinyin(source, target, use_jieba=False):
                         style=Style.TONE3,
                         neutral_tone_with_five=True)
                     transcription = ' '.join(syllables)
-                    fout.write(f'\t{transcription}\n')
+                    fout.write(f'{sentence_id}\t{transcription}\n')
                 else:
                     continue
 
diff --git a/examples/chinese_g2p/local/extract_pinyin_label.py b/examples/chinese_g2p/local/extract_pinyin_label.py
new file mode 100644
index 000000000..be7b287f4
--- /dev/null
+++ b/examples/chinese_g2p/local/extract_pinyin_label.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+
+def extract_pinyin_lables(source, target):
+    """Write "<sentence_id><TAB><pinyin>" for every Baker sentence pair."""
+    # The label file alternates lines: even-indexed lines hold
+    # "<sentence_id> <text>", odd-indexed lines hold that sentence's pinyin.
+    with open(source, 'rt', encoding='utf-8') as fin, \
+            open(target, 'wt', encoding='utf-8') as fout:
+        for i, line in enumerate(fin):
+            if i % 2 == 0:
+                fout.write(line.split(maxsplit=1)[0] + '\t')
+            else:
+                fout.write(line.strip() + '\n')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="extract baker pinyin labels")
+    parser.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    parser.add_argument(
+        "output", type=str, help="target file to write pinyin labels")
+    args = parser.parse_args()
+    extract_pinyin_lables(args.input, args.output)
diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh
index 7ef811e51..fe9948ed3 100644
--- a/examples/chinese_g2p/local/prepare_dataset.sh
+++ b/examples/chinese_g2p/local/prepare_dataset.sh
@@ -21,7 +21,8 @@ fi
 label_file='ProsodyLabeling/000001-010000.txt'
 filename='000001-010000.txt'
 unrar e ${archive} ${label_file}
-mv ${filename} ${exp_dir}
+cp ${filename} ${exp_dir}
+rm -f ${filename}
 
 if [ ! -f ${exp_dir}/${filename} ];then
     echo "File extraction failed!"
diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh
index b5f55c19f..6bde2e264 100644
--- a/examples/chinese_g2p/run.sh
+++ b/examples/chinese_g2p/run.sh
@@ -13,8 +13,10 @@ bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir}
 # convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
 filename="000001-010000.txt"
 echo "Processing transcriptions..."
-python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt"
-python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt"
+# Baker's own pinyin labels first, then pypinyin output without and with jieba
+python3 local/extract_pinyin_label.py ${exp_dir}/${filename} ${exp_dir}/"pinyin_baker.txt"
+python3 local/convert_transcription.py ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin.txt"
+python3 local/convert_transcription.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"result_pypinyin_with_jieba.txt"
 
 echo "done"
 exit 0