Add tts demo.

4 years ago · 1909f2f620
parent 3701fba0be
commit 1909f2f620
3 changed files with 112 additions and 2 deletions
--- a/demos/text_to_speech/README.md
+++ b/demos/text_to_speech/README.md
@ -0,0 +1,102 @@
+# TTS(Text To Speech)
+
+## Introduction
+Text-to-speech (TTS) is a natural language modeling process that requires changing units of text into units of speech for audio presentation. 
+
+This demo is an implementation that generates audio from the given text. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+```bash
+pip install paddlespeech
+```
+
+### 2. Prepare Input
+The input of this demo should be text in the specified language, passed via an argument.
+
+
+### 3. Usage
+- Command Line(Recommended)
+  ```bash
+  paddlespeech tts --input 今天的天气不错啊
+  ```
+  Usage:
+  ```bash
+  paddlespeech tts --help
+  ```
+  Arguments:
+  - `input`(required): Input text to generate speech from.
+  - `am`: Acoustic model type of tts task. Default: `fastspeech2_csmsc`.
+  - `am_config`: Config of acoustic model. Use default config when it is None. Default: `None`.
+  - `am_ckpt`: Acoustic model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `am_stat`: Mean and standard deviation used to normalize spectrogram when training acoustic model. Default: `None`.
+  - `phones_dict`: Phone vocabulary file. Default: `None`.
+  - `tones_dict`: Tone vocabulary file. Default: `None`.
+  - `speaker_dict`: Speaker id map file. Default: `None`.
+  - `spk_id`: Speaker id for multi speaker acoustic model. Default: `0`.
+  - `voc`: Vocoder type of tts task. Default: `pwgan_csmsc`.
+  - `voc_config`: Config of vocoder. Use default config when it is None. Default: `None`.
+  - `voc_ckpt`: Vocoder checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `voc_stat`: Mean and standard deviation used to normalize spectrogram when training vocoder. Default: `None`.
+  - `lang`: Language of tts task. Default: `zh`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+  - `output`: Output wave filepath. Default: `output.wav`.
+
+  Output:
+  ```bash
+  [2021-12-09 20:49:58,955] [    INFO] [log.py] [L57] - Wave file has been generated: output.wav
+  ```
+
+- Python API
+  ```python
+  import paddle
+  from paddlespeech.cli import TTSExecutor
+
+  tts_executor = TTSExecutor()
+  wav_file = tts_executor(
+      text='今天的天气不错啊',
+      output='output.wav',
+      am='fastspeech2_csmsc',
+      am_config=None,
+      am_ckpt=None,
+      am_stat=None,
+      spk_id=0,
+      phones_dict=None,
+      tones_dict=None,
+      speaker_dict=None,
+      voc='pwgan_csmsc',
+      voc_config=None,
+      voc_ckpt=None,
+      voc_stat=None,
+      lang='zh',
+      device=paddle.get_device())
+  print('Wave file has been generated: {}'.format(wav_file))
+  ```
+
+  Output:
+  ```bash
+  Wave file has been generated: output.wav
+  ```
+
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used from the command line and the Python API:
+
+- Acoustic model
+  | Model | Language
+  | :--- | :---: |
+  | speedyspeech_csmsc| zh
+  | fastspeech2_csmsc| zh
+  | fastspeech2_aishell3| zh
+  | fastspeech2_ljspeech| en
+  | fastspeech2_vctk| en
+
+- Vocoder
+  | Model | Language
+  | :--- | :---: |
+  | pwgan_csmsc| zh
+  | pwgan_aishell3| zh
+  | pwgan_ljspeech| en
+  | pwgan_vctk| en
+  | mb_melgan_csmsc| zh
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@ -236,6 +236,7 @@ class TTSExecutor(BaseExecutor):
        self.parser.add_argument(
            "--am_stat",
            type=str,
+            default=None,
            help="mean and standard deviation used to normalize spectrogram when training acoustic model."
        )
        self.parser.add_argument(
@ -282,6 +283,7 @@ class TTSExecutor(BaseExecutor):
        self.parser.add_argument(
            "--voc_stat",
            type=str,
+            default=None,
            help="mean and standard deviation used to normalize spectrogram when training voc."
        )
        # other
@ -543,6 +545,7 @@ class TTSExecutor(BaseExecutor):
        Returns:
            Union[str, os.PathLike]: Human-readable results such as texts and audio files.
        """
+        output = os.path.abspath(os.path.expanduser(output))
        sf.write(
            output, self._outputs['wav'].numpy(), samplerate=self.am_config.fs)
        return output
@ -593,7 +596,7 @@ class TTSExecutor(BaseExecutor):
                lang=lang,
                device=device,
                output=output)
-            logger.info('TTS Result Saved in: {}'.format(res))
+            logger.info('Wave file has been generated: {}'.format(res))
            return True
        except Exception as e:
            logger.exception(e)
--- a/paddlespeech/cli/utils.py
+++ b/paddlespeech/cli/utils.py
@ -56,12 +56,14 @@ def get_command(name: str) -> Any:

 def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
    file_dir = os.path.dirname(filepath)
+    is_zip_file = False
    if tarfile.is_tarfile(filepath):
        files = tarfile.open(filepath, "r:*")
        file_list = files.getnames()
    elif zipfile.is_zipfile(filepath):
        files = zipfile.ZipFile(filepath, 'r')
        file_list = files.namelist()
+        is_zip_file = True
    else:
        return file_dir

@ -69,6 +71,9 @@ def _get_uncompress_path(filepath: os.PathLike) -> os.PathLike:
        rootpath = file_list[0]
        uncompressed_path = os.path.join(file_dir, rootpath)
    elif download._is_a_single_dir(file_list):
+        if is_zip_file:
+            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[0]
+        else:
            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
        uncompressed_path = os.path.join(file_dir, rootpath)
    else: