diff --git a/README.md b/README.md index 32e1c23d8..8eb4cbf1e 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation. - 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), Support ASR and Feature Extraction. - 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660). diff --git a/README_cn.md b/README_cn.md index 427d59caf..a74fc2476 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,7 +164,8 @@ ### 近期更新 -- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640),支持多种语言的识别与翻译。 +- 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 +- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), 支持多种语言的识别与翻译。 - 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), 支持 ASR 和 特征提取. - 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。 - 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech)。 diff --git a/demos/TTSAndroid/.gitignore b/demos/TTSAndroid/.gitignore new file mode 100644 index 000000000..2b75303ac --- /dev/null +++ b/demos/TTSAndroid/.gitignore @@ -0,0 +1,13 @@ +*.iml +.gradle +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md new file mode 100644 index 000000000..d60135620 --- /dev/null +++ b/demos/TTSAndroid/README.md @@ -0,0 +1,189 @@ +# 语音合成 Java API Demo 使用指南 + +在 Android 上实现语音合成功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。 + +本文主要介绍语音合成 Demo 运行方法。 + +## 如何运行语音合成 Demo + +### 环境准备 + +1. 在本地环境安装好 Android Studio 工具,详细安装方法请见 [Android Stuido 官网](https://developer.android.com/studio)。 +2. 准备一部 Android 手机,并开启 USB 调试模式。开启方法: `手机设置 -> 查找开发者选项 -> 打开开发者选项和 USB 调试模式`。 + +**注意**: +> 如果您的 Android Studio 尚未配置 NDK ,请根据 Android Studio 用户指南中的[安装及配置 NDK 和 CMake ](https://developer.android.com/studio/projects/install-ndk)内容,预先配置好 NDK 。您可以选择最新的 NDK 版本,或者使用 Paddle Lite 预测库版本一样的 NDK。 + +### 部署步骤 + +1. 用 Android Studio 打开 TTSAndroid 工程。 +2. 手机连接电脑,打开 USB 调试和文件传输模式,并在 Android Studio 上连接自己的手机设备(手机需要开启允许从 USB 安装软件权限)。 + +**注意:** +>1. 如果您在导入项目、编译或者运行过程中遇到 NDK 配置错误的提示,请打开 `File > Project Structure > SDK Location`,修改 `Andriod NDK location` 为您本机配置的 NDK 所在路径。 +>2. 如果您是通过 Andriod Studio 的 SDK Tools 下载的 NDK (见本章节"环境准备"),可以直接点击下拉框选择默认路径。 +>3. 还有一种 NDK 配置方法,你可以在 `TTSAndroid/local.properties` 文件中手动添加 NDK 路径配置 `nkd.dir=/root/android-ndk-r20b` +>4. 
如果以上步骤仍旧无法解决 NDK 配置错误,请根据 Android Studio 官方文档中的[更新 Android Gradle 插件](https://developer.android.com/studio/releases/gradle-plugin?hl=zh-cn#updating-plugin)章节,尝试更新 Android Gradle plugin 版本。
+
+
+
+
+ * This technique can be used with an {@link android.app.Activity} class, not just
+ * {@link android.preference.PreferenceActivity}.
+ */
+public abstract class AppCompatPreferenceActivity extends PreferenceActivity {
+ private AppCompatDelegate mDelegate;
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ getDelegate().installViewFactory();
+ getDelegate().onCreate(savedInstanceState);
+ super.onCreate(savedInstanceState);
+ }
+
+ @Override
+ protected void onPostCreate(Bundle savedInstanceState) {
+ super.onPostCreate(savedInstanceState);
+ getDelegate().onPostCreate(savedInstanceState);
+ }
+
+ public ActionBar getSupportActionBar() {
+ return getDelegate().getSupportActionBar();
+ }
+
+
+ @Override
+ public MenuInflater getMenuInflater() {
+ return getDelegate().getMenuInflater();
+ }
+
+ @Override
+ public void setContentView(@LayoutRes int layoutResID) {
+ getDelegate().setContentView(layoutResID);
+ }
+
+ @Override
+ public void setContentView(View view) {
+ getDelegate().setContentView(view);
+ }
+
+ @Override
+ public void setContentView(View view, ViewGroup.LayoutParams params) {
+ getDelegate().setContentView(view, params);
+ }
+
+ @Override
+ public void addContentView(View view, ViewGroup.LayoutParams params) {
+ getDelegate().addContentView(view, params);
+ }
+
+ @Override
+ protected void onPostResume() {
+ super.onPostResume();
+ getDelegate().onPostResume();
+ }
+
+ @Override
+ protected void onTitleChanged(CharSequence title, int color) {
+ super.onTitleChanged(title, color);
+ getDelegate().setTitle(title);
+ }
+
+ @Override
+ public void onConfigurationChanged(Configuration newConfig) {
+ super.onConfigurationChanged(newConfig);
+ getDelegate().onConfigurationChanged(newConfig);
+ }
+
+ @Override
+ protected void onStop() {
+ super.onStop();
+ getDelegate().onStop();
+ }
+
+ @Override
+ protected void onDestroy() {
+ super.onDestroy();
+ getDelegate().onDestroy();
+ }
+
+ public void invalidateOptionsMenu() {
+ getDelegate().invalidateOptionsMenu();
+ }
+
+ private AppCompatDelegate getDelegate() {
+ if (mDelegate == null) {
+ mDelegate = AppCompatDelegate.create(this, null);
+ }
+ return mDelegate;
+ }
+}
diff --git a/demos/TTSAndroid/app/src/main/java/com/baidu/paddle/lite/demo/tts/MainActivity.java b/demos/TTSAndroid/app/src/main/java/com/baidu/paddle/lite/demo/tts/MainActivity.java
new file mode 100644
index 000000000..4156c361b
--- /dev/null
+++ b/demos/TTSAndroid/app/src/main/java/com/baidu/paddle/lite/demo/tts/MainActivity.java
@@ -0,0 +1,400 @@
+package com.baidu.paddle.lite.demo.tts;
+
+import android.Manifest;
+import android.app.ProgressDialog;
+import android.content.Intent;
+import android.content.SharedPreferences;
+import android.content.pm.PackageManager;
+import android.media.MediaPlayer;
+import android.os.Bundle;
+import android.os.Environment;
+import android.os.Handler;
+import android.os.HandlerThread;
+import android.os.Message;
+import android.preference.PreferenceManager;
+import android.support.annotation.NonNull;
+import android.support.v4.app.ActivityCompat;
+import android.support.v4.content.ContextCompat;
+import android.support.v7.app.AppCompatActivity;
+import android.text.method.ScrollingMovementMethod;
+import android.util.Log;
+import android.view.Menu;
+import android.view.MenuInflater;
+import android.view.MenuItem;
+import android.view.View;
+import android.widget.AdapterView;
+import android.widget.ArrayAdapter;
+import android.widget.Button;
+import android.widget.Spinner;
+import android.widget.TextView;
+import android.widget.Toast;
+
+import java.io.File;
+import java.io.IOException;
+
+public class MainActivity extends AppCompatActivity implements View.OnClickListener, MediaPlayer.OnPreparedListener, MediaPlayer.OnErrorListener, AdapterView.OnItemSelectedListener {
+ public static final int REQUEST_LOAD_MODEL = 0;
+ public static final int REQUEST_RUN_MODEL = 1;
+ public static final int RESPONSE_LOAD_MODEL_SUCCESSED = 0;
+ public static final int RESPONSE_LOAD_MODEL_FAILED = 1;
+ public static final int RESPONSE_RUN_MODEL_SUCCESSED = 2;
+ public static final int RESPONSE_RUN_MODEL_FAILED = 3;
+ public MediaPlayer mediaPlayer = new MediaPlayer();
+ private static final String TAG = Predictor.class.getSimpleName();
+ protected ProgressDialog pbLoadModel = null;
+ protected ProgressDialog pbRunModel = null;
+ // Receive messages from worker thread
+ protected Handler receiver = null;
+ // Send command to worker thread
+ protected Handler sender = null;
+ // Worker thread to load&run model
+ protected HandlerThread worker = null;
+    // UI components of the TTS demo
+ protected TextView tvInputSetting;
+ protected TextView tvInferenceTime;
+ protected Button btn_play;
+ protected Button btn_pause;
+ protected Button btn_stop;
+    // Model settings of the TTS demo
+ protected String modelPath = "";
+ protected int cpuThreadNum = 1;
+ protected String cpuPowerMode = "";
+ protected Predictor predictor = new Predictor();
+ int sampleRate = 24000;
+ private final String wavName = "tts_output.wav";
+ private final String wavFile = Environment.getExternalStorageDirectory() + File.separator + wavName;
+ private final String AMmodelName = "fastspeech2_csmsc_arm.nb";
+ private final String VOCmodelName = "mb_melgan_csmsc_arm.nb";
+ private float[] phones = {};
+ private final float[][] sentencesToChoose = {
+ // 009901 昨日,这名“伤者”与医生全部被警方依法刑事拘留。
+ {261, 231, 175, 116, 179, 262, 44, 154, 126, 177, 19, 262, 42, 241, 72, 177, 56, 174, 245, 37, 186, 37, 49, 151, 127, 69, 19, 179, 72, 69, 4, 260, 126, 177, 116, 151, 239, 153, 141},
+ // 009902 钱伟长想到上海来办学校是经过深思熟虑的。
+ {174, 83, 213, 39, 20, 260, 89, 40, 30, 177, 22, 71, 9, 153, 8, 37, 17, 260, 251, 260, 99, 179, 177, 116, 151, 125, 70, 233, 177, 51, 176, 108, 177, 184, 153, 242, 40, 45},
+ // 009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。
+ {182, 2, 151, 85, 232, 73, 151, 123, 154, 52, 151, 143, 154, 5, 179, 39, 113, 69, 17, 177, 114, 105, 154, 5, 179, 154, 5, 40, 45, 232, 182, 8, 37, 186, 174, 74, 182, 168},
+ // 009904 李述德在离开之前,只说了一句“柱驼杀父亲了”。
+ {153, 74, 177, 186, 40, 42, 261, 10, 153, 73, 152, 7, 262, 113, 174, 83, 179, 262, 115, 177, 230, 153, 45, 73, 151, 242, 180, 262, 186, 182, 231, 177, 2, 69, 186, 174, 124, 153, 45},
+ // 009905 这种车票和保险单捆绑出售属于重复性购买。
+ {262, 44, 262, 163, 39, 41, 173, 99, 71, 42, 37, 28, 260, 84, 40, 14, 179, 152, 220, 37, 21, 39, 183, 177, 170, 179, 177, 185, 240, 39, 162, 69, 186, 260, 128, 70, 170, 154, 9},
+ // 009906 戴佩妮的男友西米露接唱情歌,让她非常开心。
+ {40, 10, 173, 49, 155, 72, 40, 45, 155, 15, 142, 260, 72, 154, 74, 153, 186, 179, 151, 103, 39, 22, 174, 126, 70, 41, 179, 175, 22, 182, 2, 69, 46, 39, 20, 152, 7, 260, 120},
+ // 009907 观大势、谋大局、出大策始终是该院的办院方针。
+ {70, 199, 40, 5, 177, 116, 154, 168, 40, 5, 151, 240, 179, 39, 183, 40, 5, 38, 44, 179, 177, 115, 262, 161, 177, 116, 70, 7, 247, 40, 45, 37, 17, 247, 69, 19, 262, 51},
+ // 009908 他们骑着摩托回家,正好为农忙时的父母帮忙。
+ {182, 2, 154, 55, 174, 73, 262, 45, 154, 157, 182, 230, 71, 212, 151, 77, 180, 262, 59, 71, 29, 214, 155, 162, 154, 20, 177, 114, 40, 45, 69, 186, 154, 185, 37, 19, 154, 20},
+ // 009909 但是因为还没到退休年龄,只能掰着指头捱日子。
+ {40, 17, 177, 116, 120, 214, 71, 8, 154, 47, 40, 30, 182, 214, 260, 140, 155, 83, 153, 126, 180, 262, 115, 155, 57, 37, 7, 262, 45, 262, 115, 182, 171, 8, 175, 116, 261, 112},
+ // 009910 这几天雨水不断,人们恨不得待在家里不出门。
+ {262, 44, 151, 74, 182, 82, 240, 177, 213, 37, 184, 40, 202, 180, 175, 52, 154, 55, 71, 54, 37, 186, 40, 42, 40, 7, 261, 10, 151, 77, 153, 74, 37, 186, 39, 183, 154, 52}
+
+ };
+
+ @Override
+ public void onClick(View v) {
+ switch (v.getId()) {
+ case R.id.btn_play:
+ if (!mediaPlayer.isPlaying()) {
+ mediaPlayer.start();
+ }
+ break;
+ case R.id.btn_pause:
+ if (mediaPlayer.isPlaying()) {
+ mediaPlayer.pause();
+ }
+ break;
+ case R.id.btn_stop:
+ if (mediaPlayer.isPlaying()) {
+ mediaPlayer.reset();
+ initMediaPlayer();
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ private void initMediaPlayer() {
+ try {
+ File file = new File(wavFile);
+ // 指定音频文件的路径
+ mediaPlayer.setDataSource(file.getPath());
+ // 让 MediaPlayer 进入到准备状态
+ mediaPlayer.prepare();
+ // 该方法使得进入应用时就播放音频
+ // mediaPlayer.setOnPreparedListener(this);
+            // prepare() 为同步准备;如需不阻塞主线程,可改用 prepareAsync() 并启用上面的 OnPreparedListener(与 prepare() 二选一)
+            // mediaPlayer.prepareAsync();
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ }
+
+ @Override
+ public void onPrepared(MediaPlayer player) {
+ player.start();
+ }
+
+ @Override
+ public boolean onError(MediaPlayer mp, int what, int extra) {
+ // The MediaPlayer has moved to the Error state, must be reset!
+ mediaPlayer.reset();
+ initMediaPlayer();
+ return true;
+ }
+
+ @Override
+ protected void onCreate(Bundle savedInstanceState) {
+ requestAllPermissions();
+ super.onCreate(savedInstanceState);
+ setContentView(R.layout.activity_main);
+
+ // 初始化控件
+ Spinner spinner = findViewById(R.id.spinner1);
+ // 建立数据源
+ String[] sentences = getResources().getStringArray(R.array.text);
+ // 建立 Adapter 并且绑定数据源
+ ArrayAdapter
About 1.85 billion n-grams; <br/> 'trie' binary with '-a 22 -q 8 -b 8'
[Mandarin LM Small](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm) | Baidu Internal Corpus | Char-based | 2.8 GB | Pruned with 0 1 2 4 4; <br/> About 0.13 billion n-grams; <br/> 'probing' binary with default settings
[Mandarin LM Large](https://deepspeech.bj.bcebos.com/zh_lm/zhidao_giga.klm) | Baidu Internal Corpus | Char-based | 70.4 GB | No Pruning; <br/> About 3.7 billion n-grams; <br/> 'probing' binary with default settings
diff --git a/examples/csmsc/tts3_rhy/README.md b/examples/csmsc/tts3_rhy/README.md
new file mode 100644
index 000000000..855aa885c
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/README.md
@@ -0,0 +1,74 @@
+# This example mainly follows the [FastSpeech2 with CSMSC](../tts3) example
+This example contains code used to train a rhythm-aware version of the [FastSpeech2](https://arxiv.org/abs/2006.04558) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phoneme durations for FastSpeech2.
+You can download the rhythm-aware MFA result directly from [baker_alignment_tone.zip](https://paddlespeech.bj.bcebos.com/Rhy_e2e/baker_alignment_tone.zip), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+Note that in our repo, you should add the `--rhy-with-duration` flag to obtain the rhythm information.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+5. inference using the static model.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── energy_stats.npy
+ ├── norm
+ ├── pitch_stats.npy
+ ├── raw
+ └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch, and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and the id of each utterance.
+
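+A quick way to inspect `metadata.jsonl` is sketched below (field names such as `utt_id` are assumptions based on the description above, so check the file itself if they differ):
+```python
+# Print the first record of the normalized training metadata.
+import jsonlines
+
+with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
+    first = next(iter(reader))
+    print(sorted(first.keys()))   # e.g. utt_id, phones, durations, feature paths, ...
+    print(first["phones"][:10])
+```
+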
+For more details, you can refer to [FastSpeech2 with CSMSC](../tts3).
+
+## Pretrained Model
+Pretrained FastSpeech2 model for end-to-end rhythm version:
+- [fastspeech2_rhy_csmsc_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_rhy_csmsc_ckpt_1.3.0.zip)
+
+This FastSpeech2 checkpoint contains files listed below.
+```text
+fastspeech2_rhy_csmsc_ckpt_1.3.0
+├── default.yaml # default config used to train fastspeech2
+├── phone_id_map.txt # phone vocabulary file when training fastspeech2
+├── snapshot_iter_153000.pdz # model parameters and optimizer states
+├── durations.txt # the intermediate output of preprocess.sh
+├── energy_stats.npy
+├── pitch_stats.npy
+└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
+```
diff --git a/examples/csmsc/tts3_rhy/conf/default.yaml b/examples/csmsc/tts3_rhy/conf/default.yaml
new file mode 120000
index 000000000..3f69c4ebb
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/conf/default.yaml
@@ -0,0 +1 @@
+../../tts3/conf/default.yaml
\ No newline at end of file
diff --git a/examples/csmsc/tts3_rhy/local/preprocess.sh b/examples/csmsc/tts3_rhy/local/preprocess.sh
new file mode 120000
index 000000000..f4d0955e6
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/local/preprocess.sh
@@ -0,0 +1 @@
+../../tts3/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts3_rhy/local/synthesize.sh b/examples/csmsc/tts3_rhy/local/synthesize.sh
new file mode 120000
index 000000000..f36d41990
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/local/synthesize.sh
@@ -0,0 +1 @@
+../../tts3/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
new file mode 100755
index 000000000..8f5d80104
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --use_rhy=True
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --use_rhy=True
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's dygraph-to-static conversion is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --use_rhy=True
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --use_rhy=True
+fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --use_rhy=True
+fi
diff --git a/examples/csmsc/tts3_rhy/local/train.sh b/examples/csmsc/tts3_rhy/local/train.sh
new file mode 120000
index 000000000..11a597e85
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/local/train.sh
@@ -0,0 +1 @@
+../../tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts3_rhy/path.sh b/examples/csmsc/tts3_rhy/path.sh
new file mode 120000
index 000000000..394bed7e7
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/path.sh
@@ -0,0 +1 @@
+../tts3/path.sh
\ No newline at end of file
diff --git a/examples/csmsc/tts3_rhy/run.sh b/examples/csmsc/tts3_rhy/run.sh
new file mode 100755
index 000000000..e49f43ee6
--- /dev/null
+++ b/examples/csmsc/tts3_rhy/run.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+    ### please place the rhythm MFA result here
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py
index 154c25f53..dce7c7781 100644
--- a/paddlespeech/cli/ssl/infer.py
+++ b/paddlespeech/cli/ssl/infer.py
@@ -25,6 +25,7 @@ import librosa
import numpy as np
import paddle
import soundfile
+from paddlenlp.transformers import AutoTokenizer
from yacs.config import CfgNode
from ..executor import BaseExecutor
@@ -50,7 +51,7 @@ class SSLExecutor(BaseExecutor):
self.parser.add_argument(
'--model',
type=str,
- default='wav2vec2ASR_librispeech',
+ default=None,
choices=[
tag[:tag.index('-')]
for tag in self.task_resource.pretrained_models.keys()
@@ -123,7 +124,7 @@ class SSLExecutor(BaseExecutor):
help='Increase logger verbosity of current task.')
def _init_from_path(self,
- model_type: str='wav2vec2ASR_librispeech',
+ model_type: str=None,
task: str='asr',
lang: str='en',
sample_rate: int=16000,
@@ -134,6 +135,18 @@ class SSLExecutor(BaseExecutor):
Init model and other resources from a specific path.
"""
logger.debug("start to init the model")
+
+ if model_type is None:
+ if lang == 'en':
+ model_type = 'wav2vec2ASR_librispeech'
+ elif lang == 'zh':
+ model_type = 'wav2vec2ASR_aishell1'
+ else:
+ logger.error(
+ "invalid lang, please input --lang en or --lang zh")
+ logger.debug(
+ "Model type had not been specified, default {} was used.".
+ format(model_type))
# default max_len: unit:second
self.max_len = 50
if hasattr(self, 'model'):
@@ -167,9 +180,13 @@ class SSLExecutor(BaseExecutor):
self.config.merge_from_file(self.cfg_path)
if task == 'asr':
with UpdateConfig(self.config):
- self.text_feature = TextFeaturizer(
- unit_type=self.config.unit_type,
- vocab=self.config.vocab_filepath)
+ if lang == 'en':
+ self.text_feature = TextFeaturizer(
+ unit_type=self.config.unit_type,
+ vocab=self.config.vocab_filepath)
+ elif lang == 'zh':
+ self.text_feature = AutoTokenizer.from_pretrained(
+ self.config.tokenizer)
self.config.decode.decoding_method = decode_method
model_name = model_type[:model_type.rindex(
'_')] # model_type: {model_name}_{dataset}
@@ -253,7 +270,8 @@ class SSLExecutor(BaseExecutor):
audio,
text_feature=self.text_feature,
decoding_method=cfg.decoding_method,
- beam_size=cfg.beam_size)
+ beam_size=cfg.beam_size,
+ tokenizer=getattr(self.config, 'tokenizer', None))
self._outputs["result"] = result_transcripts[0][0]
except Exception as e:
logger.exception(e)
@@ -413,7 +431,7 @@ class SSLExecutor(BaseExecutor):
@stats_wrapper
def __call__(self,
audio_file: os.PathLike,
- model: str='wav2vec2ASR_librispeech',
+ model: str=None,
task: str='asr',
lang: str='en',
sample_rate: int=16000,
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 067246749..3c5aa1f90 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -70,6 +70,38 @@ ssl_dynamic_pretrained_models = {
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
},
},
+ "wav2vec2-zh-16k": {
+ '1.3': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2-large-wenetspeech-self_ckpt_1.3.0.model.tar.gz',
+ 'md5':
+ '00ea4975c05d1bb58181205674052fe1',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'chinese-wav2vec2-large',
+ 'model':
+ 'chinese-wav2vec2-large.pdparams',
+ 'params':
+ 'chinese-wav2vec2-large.pdparams',
+ },
+ },
+ "wav2vec2ASR_aishell1-zh-16k": {
+ '1.3': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz',
+ 'md5':
+ 'ac8fa0a6345e6a7535f6fabb5e59e218',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/wav2vec2ASR/checkpoints/avg_1',
+ 'model':
+ 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
+ 'params':
+ 'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
+ },
+ },
}
# ---------------------------------
@@ -1658,3 +1690,16 @@ g2pw_onnx_models = {
},
},
}
+
+# ---------------------------------
+# ------------- Rhy_frontend ---------------
+# ---------------------------------
+rhy_frontend_models = {
+ 'rhy_e2e': {
+ '1.0': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Rhy_e2e/rhy_frontend.zip',
+ 'md5': '6624a77393de5925d5a84400b363d8ef',
+ },
+ },
+}
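
The new `rhy_frontend_models` entry is consumed like the other registries: it is handed to `download_and_decompress`, which caches and unpacks the archive under `MODEL_HOME`. A minimal sketch (mirroring what `RhyPredictor` further below does internally):

```python
from paddlespeech.cli.utils import download_and_decompress
from paddlespeech.resource.pretrained_models import rhy_frontend_models
from paddlespeech.utils.env import MODEL_HOME

# Fetch and unpack the rhythm frontend checkpoint described by the registry entry.
uncompress_path = download_and_decompress(rhy_frontend_models['rhy_e2e']['1.0'], MODEL_HOME)
print(uncompress_path)  # contains rhy_default.yaml, rhy_token and the .pdz parameters
```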
diff --git a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py
index 5670cb531..688bf5f84 100644
--- a/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py
+++ b/paddlespeech/s2t/models/wav2vec2/modules/modeling_wav2vec2.py
@@ -1173,10 +1173,6 @@ class Wav2Vec2ConfigPure():
self.proj_codevector_dim = config.proj_codevector_dim
self.diversity_loss_weight = config.diversity_loss_weight
- # ctc loss
- self.ctc_loss_reduction = config.ctc_loss_reduction
- self.ctc_zero_infinity = config.ctc_zero_infinity
-
# adapter
self.add_adapter = config.add_adapter
self.adapter_kernel_size = config.adapter_kernel_size
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
index eda188da5..dc6c6d1d3 100644
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -76,28 +76,66 @@ class Wav2vec2ASR(nn.Layer):
feats: paddle.Tensor,
text_feature: Dict[str, int],
decoding_method: str,
- beam_size: int):
+ beam_size: int,
+ tokenizer: str=None):
batch_size = feats.shape[0]
if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
- logger.error(
- f'decoding mode {decoding_method} must be running with batch_size == 1'
+ raise ValueError(
+                f"decoding mode {decoding_method} must be run with batch_size == 1"
)
- logger.error(f"current batch_size is {batch_size}")
- sys.exit(1)
if decoding_method == 'ctc_greedy_search':
- hyps = self.ctc_greedy_search(feats)
- res = [text_feature.defeaturize(hyp) for hyp in hyps]
- res_tokenids = [hyp for hyp in hyps]
+ if tokenizer is None:
+ hyps = self.ctc_greedy_search(feats)
+ res = [text_feature.defeaturize(hyp) for hyp in hyps]
+ res_tokenids = [hyp for hyp in hyps]
+ else:
+ hyps = self.ctc_greedy_search(feats)
+ res = []
+ res_tokenids = []
+ for sequence in hyps:
+ # Decode token terms to words
+ predicted_tokens = text_feature.convert_ids_to_tokens(
+ sequence)
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
# ctc_prefix_beam_search and attention_rescoring only return one
# result in List[int], change it to List[List[int]] for compatible
# with other batch decoding mode
elif decoding_method == 'ctc_prefix_beam_search':
assert feats.shape[0] == 1
- hyp = self.ctc_prefix_beam_search(feats, beam_size)
- res = [text_feature.defeaturize(hyp)]
- res_tokenids = [hyp]
+ if tokenizer is None:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = [text_feature.defeaturize(hyp)]
+ res_tokenids = [hyp]
+ else:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = []
+ res_tokenids = []
+ predicted_tokens = text_feature.convert_ids_to_tokens(hyp)
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
else:
raise ValueError(
f"wav2vec2 not support decoding method: {decoding_method}")
diff --git a/paddlespeech/t2s/exps/lite_predict.py b/paddlespeech/t2s/exps/lite_predict.py
index bd0c732b1..f19ae027c 100644
--- a/paddlespeech/t2s/exps/lite_predict.py
+++ b/paddlespeech/t2s/exps/lite_predict.py
@@ -17,10 +17,10 @@ from pathlib import Path
import soundfile as sf
from timer import timer
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_frontend
-from paddlespeech.t2s.exps.syn_utils import get_lite_am_output
-from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
-from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences
diff --git a/paddlespeech/t2s/exps/lite_predict_streaming.py b/paddlespeech/t2s/exps/lite_predict_streaming.py
index 37b600512..2bd78ed01 100644
--- a/paddlespeech/t2s/exps/lite_predict_streaming.py
+++ b/paddlespeech/t2s/exps/lite_predict_streaming.py
@@ -18,13 +18,13 @@ import numpy as np
import soundfile as sf
from timer import timer
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_sublayer_output
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_streaming_am_output
+from paddlespeech.t2s.exps.lite_syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
-from paddlespeech.t2s.exps.syn_utils import get_lite_am_sublayer_output
-from paddlespeech.t2s.exps.syn_utils import get_lite_predictor
-from paddlespeech.t2s.exps.syn_utils import get_lite_streaming_am_output
-from paddlespeech.t2s.exps.syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.utils import str2bool
diff --git a/paddlespeech/t2s/exps/lite_syn_utils.py b/paddlespeech/t2s/exps/lite_syn_utils.py
new file mode 100644
index 000000000..2c67edae6
--- /dev/null
+++ b/paddlespeech/t2s/exps/lite_syn_utils.py
@@ -0,0 +1,111 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+from paddlelite.lite import create_paddle_predictor
+from paddlelite.lite import MobileConfig
+
+from .syn_utils import run_frontend
+
+
+# Paddle-Lite
+def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
+ model_file: Optional[os.PathLike]=None,
+ cpu_threads: int=1):
+ config = MobileConfig()
+ config.set_model_from_file(str(Path(model_dir) / model_file))
+ predictor = create_paddle_predictor(config)
+ return predictor
+
+
+def get_lite_am_output(
+ input: str,
+ am_predictor,
+ am: str,
+ frontend: object,
+ lang: str='zh',
+ merge_sentences: bool=True,
+ speaker_dict: Optional[os.PathLike]=None,
+ spk_id: int=0, ):
+ am_name = am[:am.rindex('_')]
+ am_dataset = am[am.rindex('_') + 1:]
+ get_spk_id = False
+ get_tone_ids = False
+ if am_name == 'speedyspeech':
+ get_tone_ids = True
+ if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
+ get_spk_id = True
+ spk_id = np.array([spk_id])
+
+ frontend_dict = run_frontend(
+ frontend=frontend,
+ text=input,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids,
+ lang=lang)
+
+ if get_tone_ids:
+ tone_ids = frontend_dict['tone_ids']
+ tones = tone_ids[0].numpy()
+ tones_handle = am_predictor.get_input(1)
+ tones_handle.from_numpy(tones)
+
+ if get_spk_id:
+ spk_id_handle = am_predictor.get_input(1)
+ spk_id_handle.from_numpy(spk_id)
+ phone_ids = frontend_dict['phone_ids']
+ phones = phone_ids[0].numpy()
+ phones_handle = am_predictor.get_input(0)
+ phones_handle.from_numpy(phones)
+ am_predictor.run()
+ am_output_handle = am_predictor.get_output(0)
+ am_output_data = am_output_handle.numpy()
+ return am_output_data
+
+
+def get_lite_voc_output(voc_predictor, input):
+ mel_handle = voc_predictor.get_input(0)
+ mel_handle.from_numpy(input)
+ voc_predictor.run()
+ voc_output_handle = voc_predictor.get_output(0)
+ wav = voc_output_handle.numpy()
+ return wav
+
+
+def get_lite_am_sublayer_output(am_sublayer_predictor, input):
+ input_handle = am_sublayer_predictor.get_input(0)
+ input_handle.from_numpy(input)
+
+ am_sublayer_predictor.run()
+ am_sublayer_handle = am_sublayer_predictor.get_output(0)
+ am_sublayer_output = am_sublayer_handle.numpy()
+ return am_sublayer_output
+
+
+def get_lite_streaming_am_output(input: str,
+ am_encoder_infer_predictor,
+ am_decoder_predictor,
+ am_postnet_predictor,
+ frontend,
+ lang: str='zh',
+ merge_sentences: bool=True):
+ get_tone_ids = False
+ frontend_dict = run_frontend(
+ frontend=frontend,
+ text=input,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids,
+ lang=lang)
+ phone_ids = frontend_dict['phone_ids']
+ phones = phone_ids[0].numpy()
+ am_encoder_infer_output = get_lite_am_sublayer_output(
+ am_encoder_infer_predictor, input=phones)
+ am_decoder_output = get_lite_am_sublayer_output(
+ am_decoder_predictor, input=am_encoder_infer_output)
+ am_postnet_output = get_lite_am_sublayer_output(
+ am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
+ am_output_data = am_decoder_output + np.transpose(am_postnet_output,
+ (0, 2, 1))
+ normalized_mel = am_output_data[0]
+ return normalized_mel
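
For reference, a minimal sketch of wiring these helpers into a non-streaming pipeline, following `lite_predict.py` (model file names and paths are placeholders):

```python
import soundfile as sf

from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_voc_output
from paddlespeech.t2s.exps.syn_utils import get_frontend

frontend = get_frontend(lang='zh', phones_dict='dump/phone_id_map.txt')
am_predictor = get_lite_predictor(model_dir='./inference', model_file='fastspeech2_csmsc_arm.nb')
voc_predictor = get_lite_predictor(model_dir='./inference', model_file='mb_melgan_csmsc_arm.nb')

# Acoustic model: text -> mel; vocoder: mel -> waveform.
mel = get_lite_am_output(
    input='欢迎使用飞桨语音合成。',
    am_predictor=am_predictor,
    am='fastspeech2_csmsc',
    frontend=frontend,
    lang='zh')
wav = get_lite_voc_output(voc_predictor=voc_predictor, input=mel)
sf.write('output.wav', wav, samplerate=24000)  # CSMSC models are 24 kHz
```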
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index cea125291..82b718488 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -26,8 +26,6 @@ import paddle
from paddle import inference
from paddle import jit
from paddle.static import InputSpec
-from paddlelite.lite import create_paddle_predictor
-from paddlelite.lite import MobileConfig
from yacs.config import CfgNode
from paddlespeech.t2s.datasets.data_table import DataTable
@@ -163,10 +161,13 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
# frontend
def get_frontend(lang: str='zh',
phones_dict: Optional[os.PathLike]=None,
- tones_dict: Optional[os.PathLike]=None):
+ tones_dict: Optional[os.PathLike]=None,
+ use_rhy=False):
if lang == 'zh':
frontend = Frontend(
- phone_vocab_path=phones_dict, tone_vocab_path=tones_dict)
+ phone_vocab_path=phones_dict,
+ tone_vocab_path=tones_dict,
+ use_rhy=use_rhy)
elif lang == 'en':
frontend = English(phone_vocab_path=phones_dict)
elif lang == 'mix':
@@ -512,105 +513,3 @@ def get_sess(model_path: Optional[os.PathLike],
sess = ort.InferenceSession(
model_path, providers=providers, sess_options=sess_options)
return sess
-
-
-# Paddle-Lite
-def get_lite_predictor(model_dir: Optional[os.PathLike]=None,
- model_file: Optional[os.PathLike]=None,
- cpu_threads: int=1):
- config = MobileConfig()
- config.set_model_from_file(str(Path(model_dir) / model_file))
- predictor = create_paddle_predictor(config)
- return predictor
-
-
-def get_lite_am_output(
- input: str,
- am_predictor,
- am: str,
- frontend: object,
- lang: str='zh',
- merge_sentences: bool=True,
- speaker_dict: Optional[os.PathLike]=None,
- spk_id: int=0, ):
- am_name = am[:am.rindex('_')]
- am_dataset = am[am.rindex('_') + 1:]
- get_spk_id = False
- get_tone_ids = False
- if am_name == 'speedyspeech':
- get_tone_ids = True
- if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
- get_spk_id = True
- spk_id = np.array([spk_id])
-
- frontend_dict = run_frontend(
- frontend=frontend,
- text=input,
- merge_sentences=merge_sentences,
- get_tone_ids=get_tone_ids,
- lang=lang)
-
- if get_tone_ids:
- tone_ids = frontend_dict['tone_ids']
- tones = tone_ids[0].numpy()
- tones_handle = am_predictor.get_input(1)
- tones_handle.from_numpy(tones)
-
- if get_spk_id:
- spk_id_handle = am_predictor.get_input(1)
- spk_id_handle.from_numpy(spk_id)
- phone_ids = frontend_dict['phone_ids']
- phones = phone_ids[0].numpy()
- phones_handle = am_predictor.get_input(0)
- phones_handle.from_numpy(phones)
- am_predictor.run()
- am_output_handle = am_predictor.get_output(0)
- am_output_data = am_output_handle.numpy()
- return am_output_data
-
-
-def get_lite_voc_output(voc_predictor, input):
- mel_handle = voc_predictor.get_input(0)
- mel_handle.from_numpy(input)
- voc_predictor.run()
- voc_output_handle = voc_predictor.get_output(0)
- wav = voc_output_handle.numpy()
- return wav
-
-
-def get_lite_am_sublayer_output(am_sublayer_predictor, input):
- input_handle = am_sublayer_predictor.get_input(0)
- input_handle.from_numpy(input)
-
- am_sublayer_predictor.run()
- am_sublayer_handle = am_sublayer_predictor.get_output(0)
- am_sublayer_output = am_sublayer_handle.numpy()
- return am_sublayer_output
-
-
-def get_lite_streaming_am_output(input: str,
- am_encoder_infer_predictor,
- am_decoder_predictor,
- am_postnet_predictor,
- frontend,
- lang: str='zh',
- merge_sentences: bool=True):
- get_tone_ids = False
- frontend_dict = run_frontend(
- frontend=frontend,
- text=input,
- merge_sentences=merge_sentences,
- get_tone_ids=get_tone_ids,
- lang=lang)
- phone_ids = frontend_dict['phone_ids']
- phones = phone_ids[0].numpy()
- am_encoder_infer_output = get_lite_am_sublayer_output(
- am_encoder_infer_predictor, input=phones)
- am_decoder_output = get_lite_am_sublayer_output(
- am_decoder_predictor, input=am_encoder_infer_output)
- am_postnet_output = get_lite_am_sublayer_output(
- am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
- am_output_data = am_decoder_output + np.transpose(am_postnet_output,
- (0, 2, 1))
- normalized_mel = am_output_data[0]
- return normalized_mel
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 9ce8286fb..625002477 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -27,6 +27,7 @@ from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.exps.syn_utils import voc_to_static
+from paddlespeech.t2s.utils import str2bool
def evaluate(args):
@@ -49,7 +50,8 @@ def evaluate(args):
frontend = get_frontend(
lang=args.lang,
phones_dict=args.phones_dict,
- tones_dict=args.tones_dict)
+ tones_dict=args.tones_dict,
+ use_rhy=args.use_rhy)
print("frontend done!")
# acoustic model
@@ -240,6 +242,11 @@ def parse_args():
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line.")
parser.add_argument("--output_dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--use_rhy",
+ type=str2bool,
+ default=False,
+ help="run rhythm frontend or not")
args = parser.parse_args()
return args
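
The new `--use_rhy` flag is simply forwarded to `get_frontend`. A minimal sketch of what the rhythm-aware frontend produces (the dictionary path is a placeholder):

```python
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import run_frontend

# use_rhy=True builds a zh Frontend with the rhythm predictor enabled.
frontend = get_frontend(
    lang='zh', phones_dict='dump/phone_id_map.txt', use_rhy=True)
outs = run_frontend(
    frontend=frontend,
    text='今天天气很好。',
    merge_sentences=True,
    get_tone_ids=False,
    lang='zh')
print(outs['phone_ids'])  # phone id sequences, with rhythm breaks (sp1~sp4) included
```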
diff --git a/paddlespeech/t2s/frontend/rhy_prediction/__init__.py b/paddlespeech/t2s/frontend/rhy_prediction/__init__.py
new file mode 100644
index 000000000..62e98f805
--- /dev/null
+++ b/paddlespeech/t2s/frontend/rhy_prediction/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .rhy_predictor import *
diff --git a/paddlespeech/t2s/frontend/rhy_prediction/rhy_predictor.py b/paddlespeech/t2s/frontend/rhy_prediction/rhy_predictor.py
new file mode 100644
index 000000000..a2a6b8a69
--- /dev/null
+++ b/paddlespeech/t2s/frontend/rhy_prediction/rhy_predictor.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+
+import paddle
+import yaml
+from paddlenlp.transformers import ErnieTokenizer
+from yacs.config import CfgNode
+
+from paddlespeech.cli.utils import download_and_decompress
+from paddlespeech.resource.pretrained_models import rhy_frontend_models
+from paddlespeech.text.models.ernie_linear import ErnieLinear
+from paddlespeech.utils.env import MODEL_HOME
+
+DefinedClassifier = {
+ 'ErnieLinear': ErnieLinear,
+}
+
+model_version = '1.0'
+
+
+class RhyPredictor():
+ def __init__(
+ self,
+ model_dir: os.PathLike=MODEL_HOME, ):
+ uncompress_path = download_and_decompress(
+ rhy_frontend_models['rhy_e2e'][model_version], model_dir)
+ with open(os.path.join(uncompress_path, 'rhy_default.yaml')) as f:
+ config = CfgNode(yaml.safe_load(f))
+ self.punc_list = []
+ with open(os.path.join(uncompress_path, 'rhy_token'), 'r') as f:
+ for line in f:
+ self.punc_list.append(line.strip())
+ self.punc_list = [0] + self.punc_list
+ self.make_rhy_dict()
+ self.model = DefinedClassifier["ErnieLinear"](**config["model"])
+ pretrained_token = config['data_params']['pretrained_token']
+ self.tokenizer = ErnieTokenizer.from_pretrained(pretrained_token)
+ state_dict = paddle.load(
+ os.path.join(uncompress_path, 'snapshot_iter_2600_main_params.pdz'))
+ self.model.set_state_dict(state_dict)
+ self.model.eval()
+
+ def _clean_text(self, text):
+ text = text.lower()
+ text = re.sub('[^A-Za-z0-9\u4e00-\u9fa5]', '', text)
+ text = re.sub(f'[{"".join([p for p in self.punc_list][1:])}]', '', text)
+ return text
+
+ def preprocess(self, text, tokenizer):
+ clean_text = self._clean_text(text)
+ assert len(clean_text) > 0, f'Invalid input string: {text}'
+ tokenized_input = tokenizer(
+ list(clean_text), return_length=True, is_split_into_words=True)
+ _inputs = dict()
+ _inputs['input_ids'] = tokenized_input['input_ids']
+ _inputs['seg_ids'] = tokenized_input['token_type_ids']
+ _inputs['seq_len'] = tokenized_input['seq_len']
+ return _inputs
+
+ def get_prediction(self, raw_text):
+ _inputs = self.preprocess(raw_text, self.tokenizer)
+ seq_len = _inputs['seq_len']
+ input_ids = paddle.to_tensor(_inputs['input_ids']).unsqueeze(0)
+ seg_ids = paddle.to_tensor(_inputs['seg_ids']).unsqueeze(0)
+ logits, _ = self.model(input_ids, seg_ids)
+ preds = paddle.argmax(logits, axis=-1).squeeze(0)
+ tokens = self.tokenizer.convert_ids_to_tokens(
+ _inputs['input_ids'][1:seq_len - 1])
+ labels = preds[1:seq_len - 1].tolist()
+ assert len(tokens) == len(labels)
+ # add 0 for non punc
+ text = ''
+ for t, l in zip(tokens, labels):
+ text += t
+ if l != 0: # Non punc.
+ text += self.punc_list[l]
+ return text
+
+ def make_rhy_dict(self):
+ self.rhy_dict = {}
+ for i, p in enumerate(self.punc_list[1:]):
+ self.rhy_dict[p] = 'sp' + str(i + 1)
+
+ def pinyin_align(self, pinyins, rhy_pre):
+ final_py = []
+ j = 0
+ for i in range(len(rhy_pre)):
+ if rhy_pre[i] in self.rhy_dict:
+ final_py.append(self.rhy_dict[rhy_pre[i]])
+ else:
+ final_py.append(pinyins[j])
+ j += 1
+ return final_py
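
A minimal sketch of using the predictor on its own (the sentence is arbitrary); the first call downloads and unpacks `rhy_frontend.zip` into `MODEL_HOME`:

```python
from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor

predictor = RhyPredictor()
# Returns the cleaned input text with rhythm tokens (from rhy_token) inserted
# after the characters where the ErnieLinear model predicts a prosodic break.
print(predictor.get_prediction('今天天气很好,我们一起去公园散步吧。'))
```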
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index e30286986..ddd8cf5c7 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -30,6 +30,7 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
@@ -82,11 +83,13 @@ class Frontend():
def __init__(self,
g2p_model="g2pW",
phone_vocab_path=None,
- tone_vocab_path=None):
+ tone_vocab_path=None,
+ use_rhy=False):
self.mix_ssml_processor = MixTextProcessor()
self.tone_modifier = ToneSandhi()
self.text_normalizer = TextNormalizer()
self.punc = ":,;。?!“”‘’':,;.?!"
+ self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
self.phrases_dict = {
'开户行': [['ka1i'], ['hu4'], ['hang2']],
'发卡行': [['fa4'], ['ka3'], ['hang2']],
@@ -105,6 +108,10 @@ class Frontend():
'嘞': [['lei5']],
'掺和': [['chan1'], ['huo5']]
}
+ self.use_rhy = use_rhy
+ if use_rhy:
+ self.rhy_predictor = RhyPredictor()
+ print("Rhythm predictor loaded.")
# g2p_model can be pypinyin and g2pM and g2pW
self.g2p_model = g2p_model
if self.g2p_model == "g2pM":
@@ -195,9 +202,13 @@ class Frontend():
segments = sentences
phones_list = []
for seg in segments:
+ if self.use_rhy:
+ seg = self.rhy_predictor._clean_text(seg)
phones = []
# Replace all English words in the sentence
seg = re.sub('[a-zA-Z]+', '', seg)
+ if self.use_rhy:
+ seg = self.rhy_predictor.get_prediction(seg)
seg_cut = psg.lcut(seg)
initials = []
finals = []
@@ -205,11 +216,18 @@ class Frontend():
# 为了多音词获得更好的效果,这里采用整句预测
if self.g2p_model == "g2pW":
try:
+ if self.use_rhy:
+ seg = self.rhy_predictor._clean_text(seg)
pinyins = self.g2pW_model(seg)[0]
except Exception:
# g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测
print("[%s] not in g2pW dict,use g2pM" % seg)
pinyins = self.g2pM_model(seg, tone=True, char_split=False)
+ if self.use_rhy:
+ rhy_text = self.rhy_predictor.get_prediction(seg)
+ final_py = self.rhy_predictor.pinyin_align(pinyins,
+ rhy_text)
+ pinyins = final_py
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
@@ -271,7 +289,7 @@ class Frontend():
phones.append(c)
if c and c in self.punc:
phones.append('sp')
- if v and v not in self.punc:
+ if v and v not in self.punc and v not in self.rhy_phns:
phones.append(v)
phones_list.append(phones)
if merge_sentences:
@@ -330,7 +348,7 @@ class Frontend():
phones.append(c)
if c and c in self.punc:
phones.append('sp')
- if v and v not in self.punc:
+ if v and v not in self.punc and v not in self.rhy_phns:
phones.append(v)
phones_list.append(phones)
if merge_sentences:
@@ -504,6 +522,11 @@ class Frontend():
print("----------------------------")
return [sum(all_phonemes, [])]
+ def add_sp_if_no(self, phonemes):
+ if not phonemes[-1][-1].startswith('sp'):
+ phonemes[-1].append('sp4')
+ return phonemes
+
def get_input_ids(self,
sentence: str,
merge_sentences: bool=True,
@@ -519,6 +542,8 @@ class Frontend():
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
+ if self.use_rhy:
+ phonemes = self.add_sp_if_no(phonemes)
result = {}
phones = []
tones = []
diff --git a/setup.py b/setup.py
index 7fb4c70be..5ed216f33 100644
--- a/setup.py
+++ b/setup.py
@@ -47,7 +47,7 @@ base = [
"onnxruntime==1.10.0",
"opencc",
"pandas",
- "paddlenlp",
+ "paddlenlp>=2.4.3",
"paddlespeech_feat",
"Pillow>=9.0.0",
"praatio==5.0.0",
@@ -71,11 +71,10 @@ base = [
"prettytable",
"zhon",
"colorlog",
- "pathos == 0.2.8",
+ "pathos==0.2.8",
"braceexpand",
"pyyaml",
"pybind11",
- "paddlelite",
"paddleslim==2.3.4",
]
diff --git a/speechx/README.md b/speechx/README.md
index a575040db..5d4b5845f 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -22,7 +22,7 @@ We develop under:
1. First, launch the docker container.
```
-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7 /bin/bash
+docker run --privileged --net=host --ipc=host -it --rm -v /path/to/paddlespeech:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7 /bin/bash
```
* More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
diff --git a/speechx/examples/custom_asr/README.md b/speechx/examples/custom_asr/README.md
index 5ffa21b50..33cf4ff03 100644
--- a/speechx/examples/custom_asr/README.md
+++ b/speechx/examples/custom_asr/README.md
@@ -1,4 +1,4 @@
-# customized Auto Speech Recognition
+# Customized ASR
## introduction
These scripts are tutorials to show you how to build your own decoding graph.
diff --git a/speechx/examples/ds2_ol/README.md b/speechx/examples/ds2_ol/README.md
index 492d0e1ac..d1da96cc9 100644
--- a/speechx/examples/ds2_ol/README.md
+++ b/speechx/examples/ds2_ol/README.md
@@ -4,3 +4,4 @@
* `websocket` - Streaming ASR with websocket for deepspeech2_aishell.
* `aishell` - Streaming Decoding under aishell dataset, for local WER test.
+* `onnx` - Example to convert deepspeech2 to onnx format.
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 3e7af9244..2ee0bbca9 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -1,12 +1,57 @@
# Aishell - Deepspeech2 Streaming
-## How to run
+> We recommend using the U2/U2++ model instead of DS2; please see [here](../../u2pp_ol/wenetspeech/).
+
+This is a C++ deployment example that uses the deepspeech2 model to recognize `wav` files and compute `CER`. We use AISHELL-1 as the test data.
+
+## Source path.sh
+
+```bash
+. path.sh
```
+
+The SpeechX binaries are under `$SPEECHX_BUILD` (check with `echo $SPEECHX_BUILD`); for more info please see `path.sh`.
+
+## Recognize with linear feature
+
+```bash
bash run.sh
```
-## Results
+`run.sh` has multiple stages; for details please see `run.sh`:
+
+1. download the dataset, model and LM
+2. convert the cmvn format and compute features
+3. decode w/o LM from features
+4. decode w/ ngram LM from features
+5. decode w/ TLG graph from features
+6. recognize w/ TLG graph from wav input
+
+### Recognize with `.scp` file for wav
+
+This script uses `recognizer_main` to recognize wav files.
+
+The input is an `scp` file which looks like this:
+```text
+# head data/split1/1/aishell_test.scp
+BAC009S0764W0121 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0121.wav
+BAC009S0764W0122 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0122.wav
+...
+BAC009S0764W0125 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0125.wav
+```
+
+If you want to recognize a single wav, you can make an `scp` file like this:
+```text
+key path/to/wav/file
+```
+
+Then specify the `--wav_rspecifier=` param for the `recognizer_main` binary. For the meaning of other flags, please see the help:
+```bash
+recognizer_main --help
+```
+
+For an example of using `recognizer_main`, please see `run.sh`.
+
### CTC Prefix Beam Search w/o LM
@@ -25,7 +70,7 @@ Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
```
-### CTC WFST
+### CTC TLG WFST
LM: [aishell train](http://paddlespeech.bj.bcebos.com/speechx/examples/ds2_ol/aishell/aishell_graph.zip)
--acoustic_scale=1.2
@@ -43,8 +88,11 @@ Mandarin -> 10.93 % N=104762 C=93410 S=9779 D=1573 I=95
Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
```
-## fbank
-```
+## Recognize with fbank feature
+
+This script is the same as `run.sh`, but uses fbank features.
+
+```bash
bash run_fbank.sh
```
@@ -66,7 +114,7 @@ Mandarin -> 5.82 % N=104762 C=99386 S=4941 D=435 I=720
English -> 0.00 % N=0 C=0 S=0 D=0 I=0
```
-### CTC WFST
+### CTC TLG WFST
LM: [aishell train](https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph2.zip)
```
@@ -75,7 +123,11 @@ Mandarin -> 9.57 % N=104762 C=94817 S=4325 D=5620 I=84
Other -> 100.00 % N=3 C=0 S=1 D=2 I=0
```
-## build TLG graph
-```
- bash run_build_tlg.sh
+## Build TLG WFST graph
+
+This script builds the TLG WFST graph. It depends on `srilm`, so please make sure it is installed.
+For more information, please see the script below.
+
+```bash
+ bash ./local/run_build_tlg.sh
```
diff --git a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh b/speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
similarity index 95%
rename from speechx/examples/ds2_ol/aishell/run_build_tlg.sh
rename to speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
index 2e148657b..07f47c7ea 100755
--- a/speechx/examples/ds2_ol/aishell/run_build_tlg.sh
+++ b/speechx/examples/ds2_ol/aishell/local/run_build_tlg.sh
@@ -22,6 +22,7 @@ mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+ # download ngram
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
@@ -29,6 +30,7 @@ if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
fi
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+ # download model
mkdir -p $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/WIP1_asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz
@@ -43,6 +45,7 @@ if [ ! -f $unit ]; then
fi
if ! which ngram-count; then
+ # need srilm install
pushd $MAIN_ROOT/tools
make srilm.done
popd
@@ -71,7 +74,7 @@ lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # Train lm
+ # Train ngram lm
cp $text $lm/text
local/aishell_train_lms.sh
echo "build LM done."
@@ -94,8 +97,8 @@ cmvn=$data/cmvn_fbank.ark
wfst=$data/lang_test
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-
if [ ! -d $data/test ]; then
+ # download test dataset
pushd $data
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
@@ -107,7 +110,8 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
-
+
+ # convert cmvn format
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
fi
@@ -116,7 +120,7 @@ label_file=aishell_result
export GLOG_logtostderr=1
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- # TLG decoder
+ # recognize w/ TLG graph
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/check_tlg.log \
recognizer_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh
index 794b533ff..49438cb25 100755
--- a/speechx/examples/ds2_ol/aishell/run.sh
+++ b/speechx/examples/ds2_ol/aishell/run.sh
@@ -32,6 +32,7 @@ exp=$PWD/exp
aishell_wav_scp=aishell_test.scp
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
if [ ! -d $data/test ]; then
+ # download dataset
pushd $data
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
unzip aishell_test.zip
@@ -43,6 +44,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
fi
if [ ! -f $ckpt_dir/data/mean_std.json ]; then
+ # download model
mkdir -p $ckpt_dir
pushd $ckpt_dir
wget -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
@@ -52,6 +54,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ];then
lm=$data/zh_giga.no_cna_cmn.prune01244.klm
if [ ! -f $lm ]; then
+ # download kenlm binary language model
pushd $data
wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
popd
@@ -68,7 +71,7 @@ export GLOG_logtostderr=1
cmvn=$data/cmvn.ark
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # 3. gen linear feat
+ # 3. convert cmvn format and compute linear feat
cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
@@ -82,14 +85,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # recognizer
+ # decode w/o lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--dict_file=$vocb_dir/vocab.txt \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result
@@ -101,14 +104,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # decode with lm
+ # decode w/ ngram lm with feature input
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
@@ -124,6 +127,7 @@ wfst=$data/wfst/
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
+ # download TLG graph
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
@@ -133,7 +137,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- # TLG decoder
+ # decode w/ TLG graph with feature input
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
ctc_tlg_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
@@ -142,7 +146,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
@@ -154,7 +158,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
- # TLG decoder
+ # recognize from wav file w/ TLG graph
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recognizer.log \
recognizer_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
@@ -162,7 +166,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
@@ -173,4 +177,4 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
echo "recognizer test have finished!!!"
echo "please checkout in ${exp}/${wer}.recognizer"
tail -n 7 $exp/${wer}.recognizer
-fi
+fi
\ No newline at end of file
diff --git a/speechx/examples/ds2_ol/aishell/run_fbank.sh b/speechx/examples/ds2_ol/aishell/run_fbank.sh
index 1c3c3e010..b93d6944d 100755
--- a/speechx/examples/ds2_ol/aishell/run_fbank.sh
+++ b/speechx/examples/ds2_ol/aishell/run_fbank.sh
@@ -68,7 +68,7 @@ export GLOG_logtostderr=1
cmvn=$data/cmvn_fbank.ark
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
- # 3. gen linear feat
+ # 3. convert cmvn format and compute fbank feat
cmvn_json2kaldi_main --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn --binary=false
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
@@ -82,7 +82,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
- # recognizer
+ # decode w/o lm by feature
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wolm.log \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
@@ -90,7 +90,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--param_path=$model_dir/avg_5.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--model_cache_shapes="5-1-2048,5-1-2048" \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--dict_file=$vocb_dir/vocab.txt \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank
@@ -100,15 +100,15 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
- # decode with lm
+ # decode with ngram lm by feature
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.lm.log \
ctc_beam_search_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
--model_path=$model_dir/avg_5.jit.pdmodel \
--param_path=$model_dir/avg_5.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --model_cache_shapes="5-1-2048,5-1-2048" \
- --nnet_decoder_chunk=8 \
+ --model_cache_shapes="5-1-2048,5-1-2048" \
+ --nnet_decoder_chunk=8 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/fbank_result_lm
@@ -131,7 +131,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
- # TLG decoder
+ # decode w/ TLG graph by feature
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.fbank.wfst.log \
ctc_tlg_decoder_main \
--feature_rspecifier=scp:$data/split${nj}/JOB/fbank_feat.scp \
@@ -139,8 +139,8 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--param_path=$model_dir/avg_5.jit.pdiparams \
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
- --model_cache_shapes="5-1-2048,5-1-2048" \
- --nnet_decoder_chunk=8 \
+ --model_cache_shapes="5-1-2048,5-1-2048" \
+ --nnet_decoder_chunk=8 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
@@ -153,6 +153,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ # recognize w/ TLG graph by wav
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/fbank_recognizer.log \
recognizer_main \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
@@ -163,7 +164,7 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
--word_symbol_table=$wfst/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--model_cache_shapes="5-1-2048,5-1-2048" \
- --nnet_decoder_chunk=8 \
+ --nnet_decoder_chunk=8 \
--graph_path=$wfst/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_fbank_recognizer
diff --git a/speechx/examples/ds2_ol/websocket/README.md b/speechx/examples/ds2_ol/websocket/README.md
new file mode 100644
index 000000000..3fa84135f
--- /dev/null
+++ b/speechx/examples/ds2_ol/websocket/README.md
@@ -0,0 +1,78 @@
+# Streaming DeepSpeech2 Server with WebSocket
+
+This example shows how to serve a streaming DeepSpeech2 model over `websocket`. For DeepSpeech2 model training, please see [here](../../../../examples/aishell/asr0/).
+
+The websocket protocol is the same as that of [PaddleSpeech Server](../../../../demos/streaming_asr_server/);
+for details of the implementation, please see [here](../../../speechx/protocol/websocket/).
+
+
+## Source path.sh
+
+```bash
+. path.sh
+```
+
+The SpeechX binaries are under `$SPEECHX_BUILD` (check it with `echo $SPEECHX_BUILD`); for more info, please see `path.sh`.
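+
+As a quick sanity check (a sketch; it simply lists whatever the build produced), you can verify the binaries exist:
+
+```bash
+# path.sh sets SPEECHX_BUILD to the directory containing the SpeechX bins
+. path.sh
+ls $SPEECHX_BUILD
+```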
+
+
+## Start WebSocket Server
+
+```bash
+bash websocket_server.sh
+```
+
+The output looks like the following:
+
+```text
+I1130 02:19:32.029882 12856 cmvn_json2kaldi_main.cc:39] cmvn josn path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/data/mean_std.json
+I1130 02:19:32.032230 12856 cmvn_json2kaldi_main.cc:73] nframe: 907497
+I1130 02:19:32.032564 12856 cmvn_json2kaldi_main.cc:85] cmvn stats have write into: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/cmvn.ark
+I1130 02:19:32.032579 12856 cmvn_json2kaldi_main.cc:86] Binary: 1
+I1130 02:19:32.798342 12937 feature_pipeline.h:53] cmvn file: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/cmvn.ark
+I1130 02:19:32.798542 12937 feature_pipeline.h:58] dither: 0
+I1130 02:19:32.798583 12937 feature_pipeline.h:60] frame shift ms: 10
+I1130 02:19:32.798588 12937 feature_pipeline.h:62] feature type: linear
+I1130 02:19:32.798596 12937 feature_pipeline.h:80] frame length ms: 20
+I1130 02:19:32.798601 12937 feature_pipeline.h:88] subsampling rate: 4
+I1130 02:19:32.798606 12937 feature_pipeline.h:90] nnet receptive filed length: 7
+I1130 02:19:32.798611 12937 feature_pipeline.h:92] nnet chunk size: 1
+I1130 02:19:32.798615 12937 feature_pipeline.h:94] frontend fill zeros: 0
+I1130 02:19:32.798630 12937 nnet_itf.h:52] subsampling rate: 4
+I1130 02:19:32.798635 12937 nnet_itf.h:54] model path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/exp/deepspeech2_online/checkpoints//avg_1.jit.pdmodel
+I1130 02:19:32.798640 12937 nnet_itf.h:57] param path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/model/exp/deepspeech2_online/checkpoints//avg_1.jit.pdiparams
+I1130 02:19:32.798643 12937 nnet_itf.h:59] DS2 param:
+I1130 02:19:32.798647 12937 nnet_itf.h:61] cache names: chunk_state_h_box,chunk_state_c_box
+I1130 02:19:32.798652 12937 nnet_itf.h:63] cache shape: 5-1-1024,5-1-1024
+I1130 02:19:32.798656 12937 nnet_itf.h:65] input names: audio_chunk,audio_chunk_lens,chunk_state_h_box,chunk_state_c_box
+I1130 02:19:32.798660 12937 nnet_itf.h:67] output names: softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0
+I1130 02:19:32.798664 12937 ctc_tlg_decoder.h:41] fst path: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/wfst//TLG.fst
+I1130 02:19:32.798669 12937 ctc_tlg_decoder.h:42] fst symbole table: /workspace/zhanghui/PaddleSpeech/speechx/examples/ds2_ol/websocket/data/wfst//words.txt
+I1130 02:19:32.798673 12937 ctc_tlg_decoder.h:47] LatticeFasterDecoder max active: 7500
+I1130 02:19:32.798677 12937 ctc_tlg_decoder.h:49] LatticeFasterDecoder beam: 15
+I1130 02:19:32.798681 12937 ctc_tlg_decoder.h:50] LatticeFasterDecoder lattice_beam: 7.5
+I1130 02:19:32.798708 12937 websocket_server_main.cc:37] Listening at port 8082
+```
+
+## Start WebSocket Client
+
+```bash
+bash websocket_client.sh
+```
+
+This script uses AISHELL-1 test data to call the websocket server.
+
+The input is specified by `--wav_rspecifier=scp:$data/$aishell_wav_scp`.
+
+The `scp` file looks like this:
+```text
+# head data/split1/1/aishell_test.scp
+BAC009S0764W0121 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0121.wav
+BAC009S0764W0122 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0122.wav
+...
+BAC009S0764W0125 /workspace/PaddleSpeech/speechx/examples/u2pp_ol/wenetspeech/data/test/S0764/BAC009S0764W0125.wav
+```
+
+If you want to recognize a single wav, you can make an `scp` file like this:
+```text
+key path/to/wav/file
+```
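+
+For example (a sketch; the key and wav path are placeholders), a one-utterance `scp` can be created like this, and `--wav_rspecifier` in `websocket_client.sh` pointed at it:
+```bash
+# a sketch: write a one-line scp with a placeholder key and wav path
+echo "my_utt /path/to/my.wav" > data/my_wav.scp
+```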
diff --git a/speechx/examples/u2pp_ol/wenetspeech/README.md b/speechx/examples/u2pp_ol/wenetspeech/README.md
index 6ca8f6dd8..6999fe3c7 100644
--- a/speechx/examples/u2pp_ol/wenetspeech/README.md
+++ b/speechx/examples/u2pp_ol/wenetspeech/README.md
@@ -6,13 +6,14 @@ This example will demonstrate how to using the u2/u2++ model to recognize `wav`
## Testing with Aishell Test Data
-### Source `path.sh` first
+## Source path.sh
-```bash
-source path.sh
+```bash
+. path.sh
```
-All bins are under `echo $SPEECHX_BUILD` dir.
+The SpeechX binaries are under `$SPEECHX_BUILD` (check it with `echo $SPEECHX_BUILD`); for more info, please see `path.sh`.
+
### Download dataset and model
diff --git a/speechx/examples/u2pp_ol/wenetspeech/run.sh b/speechx/examples/u2pp_ol/wenetspeech/run.sh
index 711d68083..4bbf79201 100755
--- a/speechx/examples/u2pp_ol/wenetspeech/run.sh
+++ b/speechx/examples/u2pp_ol/wenetspeech/run.sh
@@ -83,5 +83,10 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# decode with wav input
- ./loca/recognizer.sh
+ ./local/recognizer.sh
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # decode with wav input using the quantized model
+ ./local/recognizer_quant.sh
fi