From 29009089ec4380af7e13f76e918b6a8c09b3a1a0 Mon Sep 17 00:00:00 2001
From: iclementine <iclementine@outlook.com>
Date: Tue, 18 May 2021 16:55:25 +0800
Subject: [PATCH] 1. remove script for data downloading, since Baker dataset is
 not easily downloaded via terminal; 2. remove pypinyin as an extra
 requirement; it is already required by the main project; 3. clean code.

---
 examples/chinese_g2p/README.md                |  3 ++
 examples/chinese_g2p/data/README.md           |  3 --
 examples/chinese_g2p/local/data_download.sh   | 21 --------
 examples/chinese_g2p/local/extract_pinyin.py  | 51 +++++++++++++------
 examples/chinese_g2p/local/prepare_dataset.sh | 31 +++++++++++
 examples/chinese_g2p/requirements.txt         |  1 -
 examples/chinese_g2p/run.sh                   | 35 ++++---------
 7 files changed, 79 insertions(+), 66 deletions(-)
 create mode 100644 examples/chinese_g2p/README.md
 delete mode 100644 examples/chinese_g2p/data/README.md
 delete mode 100644 examples/chinese_g2p/local/data_download.sh
 create mode 100644 examples/chinese_g2p/local/prepare_dataset.sh

diff --git a/examples/chinese_g2p/README.md b/examples/chinese_g2p/README.md
new file mode 100644
index 000000000..8855d37a9
--- /dev/null
+++ b/examples/chinese_g2p/README.md
@@ -0,0 +1,3 @@
+# Download Baker dataset
+
+The Baker dataset has to be downloaded manually and moved to 'data/', because you have to pass a CAPTCHA in a browser to download it.
diff --git a/examples/chinese_g2p/data/README.md b/examples/chinese_g2p/data/README.md
deleted file mode 100644
index 2e25312c2..000000000
--- a/examples/chinese_g2p/data/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Download Baker dataset
-
-Baker dataset has to be downloaded mannually and move to this folder, because you will have to pass the CATTCHA from a browswe to download the dataset.
diff --git a/examples/chinese_g2p/local/data_download.sh b/examples/chinese_g2p/local/data_download.sh
deleted file mode 100644
index 13fe0ac84..000000000
--- a/examples/chinese_g2p/local/data_download.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-
-. ${MAIN_ROOT}/utils/utility.sh
-
-DOWNLOAD_DIR=$(dirname $0)/../data
-mkdir -p ${DOWNLOAD_DIR}
-
-# you may need to pass the authentification to download the data via a browser
-URL=https://online-of-baklong.oss-cn-huhehaote.aliyuncs.com/story_resource/BZNSYP.rar
-
-MD5="c4350563bf7dc298f7dd364b2607be83"
-TARGET=${DOWNLOAD_DIR}/BZNSYP.rar
-
-echo "Download Baker TTS dataset..."
-download ${URL} ${MD5} ${TARGET}
-if [ $? -ne 0 ]; then
-    echo "Fail to downlaod Baker TTS dataset!"
-    exit
-fi
-
-exit 0
diff --git a/examples/chinese_g2p/local/extract_pinyin.py b/examples/chinese_g2p/local/extract_pinyin.py
index 4b806862e..5f2c663bd 100644
--- a/examples/chinese_g2p/local/extract_pinyin.py
+++ b/examples/chinese_g2p/local/extract_pinyin.py
@@ -1,33 +1,54 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 import re
+
 import jieba
-import pypinyin
-from pypinyin import lazy_pinyin, Style
+from pypinyin import lazy_pinyin
+from pypinyin import Style
+
 
 def extract_pinyin(source, target, use_jieba=False):
-    with open(source, 'rt', encoding='utf-8') as f:
-        with open(target, 'wt', encoding='utf-8') as g:
-            for i, line in enumerate(f):
+    with open(source, 'rt', encoding='utf-8') as fin:
+        with open(target, 'wt', encoding='utf-8') as fout:
+            for i, line in enumerate(fin):
                 if i % 2 == 0:
-                    g.write(line)
+                    fout.write(line)
                     sentence_id, raw_text = line.strip().split()
                     raw_text = re.sub(r'#\d', '', raw_text)
                     if use_jieba:
                         raw_text = jieba.lcut(raw_text)
-                    syllables = lazy_pinyin(raw_text, errors='ignore', style=Style.TONE3, neutral_tone_with_five=True)
+                    syllables = lazy_pinyin(
+                        raw_text,
+                        errors='ignore',
+                        style=Style.TONE3,
+                        neutral_tone_with_five=True)
                     transcription = ' '.join(syllables)
-                    g.write(f'\t{transcription}\n')
+                    fout.write(f'\t{transcription}\n')
                 else:
                     continue
-                    
-                
-        
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="extract baker pinyin labels")
-    parser.add_argument("input", type=str, help="source file of baker's prosody label file")
-    parser.add_argument("output", type=str, help="target file to write pinyin lables")
-    parser.add_argument("--use-jieba", action='store_true', help="use jieba for word segmentation.")
+    parser.add_argument(
+        "input", type=str, help="source file of baker's prosody label file")
+    parser.add_argument(
+        "output", type=str, help="target file to write pinyin lables")
+    parser.add_argument(
+        "--use-jieba",
+        action='store_true',
+        help="use jieba for word segmentation.")
     args = parser.parse_args()
-    print(args)
     extract_pinyin(args.input, args.output, use_jieba=args.use_jieba)
diff --git a/examples/chinese_g2p/local/prepare_dataset.sh b/examples/chinese_g2p/local/prepare_dataset.sh
new file mode 100644
index 000000000..7ef811e51
--- /dev/null
+++ b/examples/chinese_g2p/local/prepare_dataset.sh
@@ -0,0 +1,31 @@
+echo "Extracting Prosody Labeling"
+# Verify the Baker archive in data_dir, then extract its prosody label file into exp_dir.
+exp_dir="exp"
+data_dir="data"
+source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
+
+archive=${data_dir}/"BZNSYP.rar"
+if [ ! -f "${archive}" ]; then
+    echo "Baker Dataset not found! Download it first to the data_dir."
+    exit -1
+fi
+# Integrity check; operands are quoted so an empty md5_result counts as a mismatch.
+MD5='c4350563bf7dc298f7dd364b2607be83'
+md5_result=$(md5sum "${archive}" | awk '{print $1}')
+if [ "${md5_result}" != "${MD5}" ]; then
+    echo "MD5 mismatch! The Archive has been changed."
+    exit -1
+fi
+
+# `unrar e` drops the archive path, so the file lands in the current directory first.
+label_file='ProsodyLabeling/000001-010000.txt'
+filename='000001-010000.txt'
+unrar e "${archive}" "${label_file}"
+mv "${filename}" "${exp_dir}"
+
+if [ ! -f "${exp_dir}/${filename}" ]; then
+    echo "File extraction failed!"
+    exit -1  # a bare `exit` would return the echo's status, i.e. success
+fi
+
+exit 0
diff --git a/examples/chinese_g2p/requirements.txt b/examples/chinese_g2p/requirements.txt
index 3d5d90d32..c84f42278 100644
--- a/examples/chinese_g2p/requirements.txt
+++ b/examples/chinese_g2p/requirements.txt
@@ -1,2 +1 @@
 jieba
-pypinyin
diff --git a/examples/chinese_g2p/run.sh b/examples/chinese_g2p/run.sh
index ea394b005..b5f55c19f 100644
--- a/examples/chinese_g2p/run.sh
+++ b/examples/chinese_g2p/run.sh
@@ -1,37 +1,20 @@
 #!/usr/bin/env bash
 source path.sh
 
-stage=0
+stage=-1
 stop_stage=100
 
+exp_dir="exp"
+data_dir="data"
 source ${MAIN_ROOT}/utils/parse_options.sh || exit -1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    bash local/data_download.sh
-    if [ $? -ne 0 ]; then
-	exit 1
-    fi
-fi
-
-EXP_DIR="exp"
-mkdir -p ${EXP_DIR}
-
-ARCHIVE="data/BZNSYP.rar"
-
-echo "Extracting Prosody Labeling"
-LABEL_FILE='ProsodyLabeling/000001-010000.txt'
-FILENAME='000001-010000.txt'
-unrar e ${ARCHIVE} ${LABEL_FILE}
-mv ${FILENAME} ${EXP_DIR}
-
-if [ ! -f ${EXP_DIR}/${FILENAME} ];then
-    echo "File extraction failed!"
-    exit
-fi
+mkdir -p ${exp_dir}
+bash local/prepare_dataset.sh --exp-dir ${exp_dir} --data-dir ${data_dir} || exit -1  # abort if dataset preparation fails
 
 # convert transcription in chinese into pinyin with pypinyin or jieba+pypinyin
-python3 local/extract_pinyin.py ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_result.txt"
-python3 local/extract_pinyin.py --use-jieba ${EXP_DIR}/${FILENAME} ${EXP_DIR}/"pypinyin_with_jieba_result.txt"
+filename="000001-010000.txt"
+echo "Processing transcriptions..."
+python3 local/extract_pinyin.py ${exp_dir}/${filename} ${exp_dir}/"pypinyin_result.txt"
+python3 local/extract_pinyin.py --use-jieba ${exp_dir}/${filename} ${exp_dir}/"pypinyin_with_jieba_result.txt"
 
 echo "done"
 exit 0