From d5c3956c944dd63f4318de1c241aaacda88748f3 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Thu, 13 May 2021 19:20:09 +0800
Subject: [PATCH] speech text process docs (#607)

* add more speech doc

* fix doc path and mergify

* format doc
---
 .mergify.yml                                  |   6 +++---
 {docs => doc}/images/multi_gpu_speedup.png    | Bin
 {docs => doc}/images/prosody.jpeg             | Bin
 {docs => doc}/images/tuning_error_surface.png | Bin
 {docs => doc}/src/asr_postprocess.md          |  16 +++++++++-------
 {docs => doc}/src/augmentation.md             |   0
 {docs => doc}/src/benchmark.md                |   0
 {docs => doc}/src/chinese_syllable.md         |   0
 {docs => doc}/src/data_preparation.md         |   0
 doc/src/dataset.md                            |  15 +++++++++++++++
 {docs => doc}/src/faq.md                      |   0
 {docs => doc}/src/getting_started.md          |   0
 {docs => doc}/src/install.md                  |   0
 {docs => doc}/src/ngram_lm.md                 |   2 --
 {docs => doc}/src/reference.md                |   0
 {docs => doc}/src/released_model.md           |   0
 {docs => doc}/src/server.md                   |   2 +-
 {docs => doc}/src/speech_synthesis.md         |   0
 {docs => doc}/src/text_front_end.md           |  11 +++++++++++
 tools/Makefile                                |   2 +-
 20 files changed, 40 insertions(+), 14 deletions(-)
 rename {docs => doc}/images/multi_gpu_speedup.png (100%)
 rename {docs => doc}/images/prosody.jpeg (100%)
 rename {docs => doc}/images/tuning_error_surface.png (100%)
 rename {docs => doc}/src/asr_postprocess.md (83%)
 rename {docs => doc}/src/augmentation.md (100%)
 rename {docs => doc}/src/benchmark.md (100%)
 rename {docs => doc}/src/chinese_syllable.md (100%)
 rename {docs => doc}/src/data_preparation.md (100%)
 create mode 100644 doc/src/dataset.md
 rename {docs => doc}/src/faq.md (100%)
 rename {docs => doc}/src/getting_started.md (100%)
 rename {docs => doc}/src/install.md (100%)
 rename {docs => doc}/src/ngram_lm.md (99%)
 rename {docs => doc}/src/reference.md (100%)
 rename {docs => doc}/src/released_model.md (100%)
 rename {docs => doc}/src/server.md (99%)
 rename {docs => doc}/src/speech_synthesis.md (100%)
 rename {docs => doc}/src/text_front_end.md (86%)

diff --git a/.mergify.yml b/.mergify.yml
index 255276858..5a1e1ff34 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -47,19 +47,19 @@ pull_request_rules:
         add: ["README"]
   - name: "auto add label=Documentation"
     conditions:
-      - files~=^docs/
+      - files~=^doc/
     actions:
       label:
         add: ["Documentation"]
   - name: "auto add label=CI"
     conditions:
-      - files~=^(.circleci/|ci/|.github/|.travis.yml)
+      - files~=^(.circleci/|ci/|.github/|.travis.yml|.travis|env.sh)
     actions:
       label:
         add: ["CI"]
   - name: "auto add label=Installation"
     conditions:
-      - files~=^(tools/|setup.py|setup.sh|env.sh|.travis)
+      - files~=^(tools/|setup.py|setup.sh)
     actions:
       label:
         add: ["Installation"]
diff --git a/docs/images/multi_gpu_speedup.png b/doc/images/multi_gpu_speedup.png
similarity index 100%
rename from docs/images/multi_gpu_speedup.png
rename to doc/images/multi_gpu_speedup.png
diff --git a/docs/images/prosody.jpeg b/doc/images/prosody.jpeg
similarity index 100%
rename from docs/images/prosody.jpeg
rename to doc/images/prosody.jpeg
diff --git a/docs/images/tuning_error_surface.png b/doc/images/tuning_error_surface.png
similarity index 100%
rename from docs/images/tuning_error_surface.png
rename to doc/images/tuning_error_surface.png
diff --git a/docs/src/asr_postprocess.md b/doc/src/asr_postprocess.md
similarity index 83%
rename from docs/src/asr_postprocess.md
rename to doc/src/asr_postprocess.md
index 772bf8b25..0c84181d1 100644
--- a/docs/src/asr_postprocess.md
+++ b/doc/src/asr_postprocess.md
@@ -1,8 +1,9 @@
 # ASR PostProcess
 
-* Text Corrector
-* Text Filter
-* Add Punctuation
+1. [Text Segmentation](text_front_end.md#text-segmentation)
+2. Text Corrector
+3. Add Punctuation
+4. Text Filter
 
 
 
@@ -10,6 +11,7 @@
 
 * [pycorrector](https://github.com/shibing624/pycorrector)
   本项目重点解决其中的谐音、混淆音、形似字错误、中文拼音全拼、语法错误带来的纠错任务。PS：[网友源码解读](https://zhuanlan.zhihu.com/p/138981644)
+* DeepCorrection [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d)  [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f)
 
 
 
@@ -88,12 +90,12 @@
 
 
 
-## Text Filter
-
-* 敏感词（黄暴、涉政、违法违禁等）
+## Add Punctuation
 
+* DeepCorrection [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d)  [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f)
 
 
 
+## Text Filter
 
-## Add Punctuation
+* 敏感词（黄暴、涉政、违法违禁等）
diff --git a/docs/src/augmentation.md b/doc/src/augmentation.md
similarity index 100%
rename from docs/src/augmentation.md
rename to doc/src/augmentation.md
diff --git a/docs/src/benchmark.md b/doc/src/benchmark.md
similarity index 100%
rename from docs/src/benchmark.md
rename to doc/src/benchmark.md
diff --git a/docs/src/chinese_syllable.md b/doc/src/chinese_syllable.md
similarity index 100%
rename from docs/src/chinese_syllable.md
rename to doc/src/chinese_syllable.md
diff --git a/docs/src/data_preparation.md b/doc/src/data_preparation.md
similarity index 100%
rename from docs/src/data_preparation.md
rename to doc/src/data_preparation.md
diff --git a/doc/src/dataset.md b/doc/src/dataset.md
new file mode 100644
index 000000000..231773a9b
--- /dev/null
+++ b/doc/src/dataset.md
@@ -0,0 +1,15 @@
+# Dataset
+
+## Text
+
+* [Tatoeba](https://tatoeba.org/cmn)
+
+  **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition.
+
+
+
+## Speech
+
+* [Tatoeba](https://tatoeba.org/cmn)
+
+  **Tatoeba is a collection of sentences and translations.** It's collaborative, open, free and even addictive. An open data initiative aimed at translation and speech recognition.
diff --git a/docs/src/faq.md b/doc/src/faq.md
similarity index 100%
rename from docs/src/faq.md
rename to doc/src/faq.md
diff --git a/docs/src/getting_started.md b/doc/src/getting_started.md
similarity index 100%
rename from docs/src/getting_started.md
rename to doc/src/getting_started.md
diff --git a/docs/src/install.md b/doc/src/install.md
similarity index 100%
rename from docs/src/install.md
rename to doc/src/install.md
diff --git a/docs/src/ngram_lm.md b/doc/src/ngram_lm.md
similarity index 99%
rename from docs/src/ngram_lm.md
rename to doc/src/ngram_lm.md
index 4dc92cd9f..7bf21f894 100644
--- a/docs/src/ngram_lm.md
+++ b/doc/src/ngram_lm.md
@@ -86,5 +86,3 @@ Please notice that the released language models only contain Chinese simplified
    ```
    build/bin/build_binary ./result/people2014corpus_words.arps ./result/people2014corpus_words.klm
    ```
-
-
diff --git a/docs/src/reference.md b/doc/src/reference.md
similarity index 100%
rename from docs/src/reference.md
rename to doc/src/reference.md
diff --git a/docs/src/released_model.md b/doc/src/released_model.md
similarity index 100%
rename from docs/src/released_model.md
rename to doc/src/released_model.md
diff --git a/docs/src/server.md b/doc/src/server.md
similarity index 99%
rename from docs/src/server.md
rename to doc/src/server.md
index 019ebcfa4..4918d5ebe 100644
--- a/docs/src/server.md
+++ b/doc/src/server.md
@@ -25,7 +25,7 @@ Then to start the client, please run this in another console:
 
 ```bash
 CUDA_VISIBLE_DEVICES=0 bash local/client.sh
-```  
+```
 
 Now, in the client console, press the `whitespace` key, hold, and start speaking. Until finishing your utterance, release the key to let the speech-to-text results shown in the console. To quit the client, just press `ESC` key.
 
diff --git a/docs/src/speech_synthesis.md b/doc/src/speech_synthesis.md
similarity index 100%
rename from docs/src/speech_synthesis.md
rename to doc/src/speech_synthesis.md
diff --git a/docs/src/text_front_end.md b/doc/src/text_front_end.md
similarity index 86%
rename from docs/src/text_front_end.md
rename to doc/src/text_front_end.md
index 5d53f5137..01b608591 100644
--- a/docs/src/text_front_end.md
+++ b/doc/src/text_front_end.md
@@ -1,5 +1,16 @@
 # Text Front End
 
+
+
+## Text Segmentation
+
+There are various libraries, including some of the most popular ones like NLTK, spaCy, and Stanford CoreNLP, that provide excellent, easy-to-use functions for sentence segmentation.
+
+* https://github.com/bminixhofer/nnsplit
+* [DeepSegment](https://github.com/notAI-tech/deepsegment)  [blog](http://bpraneeth.com/projects/deepsegment) [1](https://praneethbedapudi.medium.com/deepcorrection-1-sentence-segmentation-of-unpunctuated-text-a1dbc0db4e98) [2](https://praneethbedapudi.medium.com/deepcorrection2-automatic-punctuation-restoration-ac4a837d92d9) [3](https://praneethbedapudi.medium.com/deepcorrection-3-spell-correction-and-simple-grammar-correction-d033a52bc11d)  [4](https://praneethbedapudi.medium.com/deepsegment-2-0-multilingual-text-segmentation-with-vector-alignment-fd76ce62194f)
+
+
+
 ## Text Normalization(文本正则)
 
 文本正则化 文本正则化主要是讲非标准词(NSW)进行转化，比如：  
diff --git a/tools/Makefile b/tools/Makefile
index ef721c2b8..ea57cd2c0 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,4 +1,4 @@
-PYTHON:= python3.7
+PYTHON:= python3.8
 .PHONY: all clean
 
 all: virtualenv