From 4be42926585f89c2f85088beafb545b00664ffb5 Mon Sep 17 00:00:00 2001
From: fucong
Date: Mon, 27 Mar 2023 16:07:51 +0800
Subject: [PATCH] [TTS] fastspeech2.infer with p/e/d control, test=tts

---
 paddlespeech/cli/tts/infer.py                 | 40 ++++++++-
 .../t2s/models/fastspeech2/fastspeech2.py     | 83 ++++++++++++++++++-
 2 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 4787e1eeb..d4465041a 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -457,6 +457,16 @@ class TTSExecutor(BaseExecutor):
               text: str,
               lang: str='zh',
               am: str='fastspeech2_csmsc',
+              durations_scale: float=None,
+              durations_bias: float=None,
+              pitch_scale: float=None,
+              pitch_bias: float=None,
+              pitch_stats_mean: float=None,
+              pitch_stats_std: float=None,
+              energy_scale: float=None,
+              energy_bias: float=None,
+              energy_stats_mean: float=None,
+              energy_stats_std: float=None,
               spk_id: int=0):
         """
         Model inference and result stored in self.output.
@@ -493,7 +503,16 @@ class TTSExecutor(BaseExecutor):
                     mel = self.am_inference(
                         part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                 else:
-                    mel = self.am_inference(part_phone_ids)
+                    use_teacher_forcing = any([a is not None for a in [durations_scale, durations_bias,
+                                                                       pitch_scale, pitch_bias,
+                                                                       energy_scale, energy_bias]])
+                    mel = self.am_inference(part_phone_ids,
+                                            durations_scale=durations_scale, durations_bias=durations_bias,
+                                            pitch_scale=pitch_scale, pitch_bias=pitch_bias,
+                                            pitch_stats_mean=pitch_stats_mean, pitch_stats_std=pitch_stats_std,
+                                            energy_scale=energy_scale, energy_bias=energy_bias,
+                                            energy_stats_mean=energy_stats_mean, energy_stats_std=energy_stats_std,
+                                            use_teacher_forcing=use_teacher_forcing)
                 self.am_time += (time.time() - am_st)
                 # voc
                 voc_st = time.time()
@@ -687,7 +706,17 @@ class TTSExecutor(BaseExecutor):
                  output: str='output.wav',
                  use_onnx: bool=False,
                  cpu_threads: int=2,
-                 fs: int=24000):
+                 fs: int=24000,
+                 durations_scale=None,
+                 durations_bias=None,
+                 pitch_scale=None,
+                 pitch_bias=None,
+                 pitch_stats_mean=None,
+                 pitch_stats_std=None,
+                 energy_scale=None,
+                 energy_bias=None,
+                 energy_stats_mean=None,
+                 energy_stats_std=None):
         """
         Python API to call an executor.
         """
@@ -707,7 +736,12 @@ class TTSExecutor(BaseExecutor):
                 voc_stat=voc_stat,
                 lang=lang)

-            self.infer(text=text, lang=lang, am=am, spk_id=spk_id)
+            self.infer(text=text, lang=lang, am=am, spk_id=spk_id,
+                       durations_scale=durations_scale, durations_bias=durations_bias,
+                       pitch_scale=pitch_scale, pitch_bias=pitch_bias,
+                       pitch_stats_mean=pitch_stats_mean, pitch_stats_std=pitch_stats_std,
+                       energy_scale=energy_scale, energy_bias=energy_bias,
+                       energy_stats_mean=energy_stats_mean, energy_stats_std=energy_stats_std)
             res = self.postprocess(output=output)
             return res
         else:
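With the hunks above applied, the new prosody controls are reachable from the
Python CLI API. A minimal usage sketch (not part of the patch; the output file
name and numeric values are illustrative, and the pitch statistics would
normally be read from the pitch-statistics file shipped with the acoustic
model):

    from paddlespeech.cli.tts.infer import TTSExecutor

    tts = TTSExecutor()
    tts(
        text="今天天气十分不错。",
        am="fastspeech2_csmsc",
        lang="zh",
        output="output_fast_high.wav",
        # Shrink predicted durations -> faster speech.
        durations_scale=0.8,
        # Scale F0 up by 20%; the pitch CMVN statistics are required so the
        # model can denormalize and renormalize around the scaling.
        pitch_scale=1.2,
        pitch_stats_mean=5.2,   # illustrative values only
        pitch_stats_std=0.25)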
""" @@ -707,7 +736,12 @@ class TTSExecutor(BaseExecutor): voc_stat=voc_stat, lang=lang) - self.infer(text=text, lang=lang, am=am, spk_id=spk_id) + self.infer(text=text, lang=lang, am=am, spk_id=spk_id, + durations_scale=durations_scale, durations_bias=durations_bias, + pitch_scale=pitch_scale, pitch_bias=pitch_bias, + pitch_stats_mean=pitch_stats_mean, pitch_stats_std=pitch_stats_std, + energy_scale=energy_scale, energy_bias=energy_bias, + energy_stats_mean=energy_stats_mean, energy_stats_std=energy_stats_std) res = self.postprocess(output=output) return res else: diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 8ce19795e..f2bd7c69a 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -596,6 +596,16 @@ class FastSpeech2(nn.Layer): ds: paddle.Tensor=None, ps: paddle.Tensor=None, es: paddle.Tensor=None, + durations_scale: float=None, + durations_bias: float=None, + pitch_scale: float = None, + pitch_bias: float = None, + pitch_stats_mean: float = None, + pitch_stats_std: float = None, + energy_scale: float = None, + energy_bias: float = None, + energy_stats_mean: float = None, + energy_stats_std: float = None, is_inference: bool=False, return_after_enc=False, alpha: float=1.0, @@ -645,10 +655,32 @@ class FastSpeech2(nn.Layer): d_outs = ds else: d_outs = self.duration_predictor.inference(hs, d_masks) + if durations_scale is not None or durations_bias is not None: + durations_scale = durations_scale if durations_scale is not None else 1 + durations_bias = durations_bias if durations_bias is not None else 0 + d_outs = durations_scale * d_outs + durations_bias + + # pitch控制 if ps is not None: p_outs = ps + elif pitch_scale is not None or pitch_bias is not None: + pitch_scale = pitch_scale if pitch_scale is not None else 1 + pitch_bias = pitch_bias if pitch_bias is not None else 0 + assert pitch_stats_mean is not None and pitch_stats_std is not None + p_outs = paddle.exp(p_outs * pitch_stats_std + pitch_stats_mean) + p_outs = p_outs * pitch_scale + pitch_bias + p_outs = (paddle.log(p_outs) - pitch_stats_mean) / pitch_stats_std + + # energy控制 if es is not None: e_outs = es + elif energy_scale is not None or energy_bias is not None: + energy_scale = energy_scale if energy_scale is not None else 1 + energy_bias = energy_bias if energy_bias is not None else 0 + assert energy_stats_mean is not None and energy_stats_std is not None + e_outs = paddle.exp(e_outs * energy_stats_std + energy_stats_mean) + e_outs = e_outs * energy_scale + energy_bias + e_outs = (paddle.log(e_outs) - energy_stats_mean) / energy_stats_std # use prediction in inference # (B, Tmax, 1) @@ -747,6 +779,16 @@ class FastSpeech2(nn.Layer): durations: paddle.Tensor=None, pitch: paddle.Tensor=None, energy: paddle.Tensor=None, + durations_scale: float = None, + durations_bias: float = None, + pitch_scale: float = None, + pitch_bias: float = None, + pitch_stats_mean: float = None, + pitch_stats_std: float = None, + energy_scale: float = None, + energy_bias: float = None, + energy_stats_mean: float = None, + energy_stats_std: float = None, alpha: float=1.0, use_teacher_forcing: bool=False, spk_emb=None, @@ -764,6 +806,26 @@ class FastSpeech2(nn.Layer): Groundtruth of token-averaged pitch (T, 1). energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). 
@@ -747,6 +779,16 @@ class FastSpeech2(nn.Layer):
             durations: paddle.Tensor=None,
             pitch: paddle.Tensor=None,
             energy: paddle.Tensor=None,
+            durations_scale: float=None,
+            durations_bias: float=None,
+            pitch_scale: float=None,
+            pitch_bias: float=None,
+            pitch_stats_mean: float=None,
+            pitch_stats_std: float=None,
+            energy_scale: float=None,
+            energy_bias: float=None,
+            energy_stats_mean: float=None,
+            energy_stats_std: float=None,
             alpha: float=1.0,
             use_teacher_forcing: bool=False,
             spk_emb=None,
@@ -764,6 +806,26 @@ class FastSpeech2(nn.Layer):
                 Groundtruth of token-averaged pitch (T, 1).
             energy(Tensor, optional):
                 Groundtruth of token-averaged energy (T, 1).
+            durations_scale(int, float, optional):
+                Scale factor for duration control at inference.
+            durations_bias(int, float, optional):
+                Bias for duration control at inference.
+            pitch_scale(int, float, optional):
+                Scale factor for pitch control at inference.
+            pitch_bias(int, float, optional):
+                Bias for pitch control at inference.
+            pitch_stats_mean(int, float, optional):
+                Mean of the pitch statistics, required when pitch_scale or pitch_bias is set.
+            pitch_stats_std(int, float, optional):
+                Standard deviation of the pitch statistics, required when pitch_scale or pitch_bias is set.
+            energy_scale(int, float, optional):
+                Scale factor for energy control at inference.
+            energy_bias(int, float, optional):
+                Bias for energy control at inference.
+            energy_stats_mean(int, float, optional):
+                Mean of the energy statistics, required when energy_scale or energy_bias is set.
+            energy_stats_std(int, float, optional):
+                Standard deviation of the energy statistics, required when energy_scale or energy_bias is set.
             alpha(float, optional):
                 Alpha to control the speed.
             use_teacher_forcing(bool, optional):
@@ -806,9 +868,20 @@ class FastSpeech2(nn.Layer):
                 ds=ds,
                 ps=ps,
                 es=es,
+                durations_scale=durations_scale,
+                durations_bias=durations_bias,
+                pitch_scale=pitch_scale,
+                pitch_bias=pitch_bias,
+                pitch_stats_mean=pitch_stats_mean,
+                pitch_stats_std=pitch_stats_std,
+                energy_scale=energy_scale,
+                energy_bias=energy_bias,
+                energy_stats_mean=energy_stats_mean,
+                energy_stats_std=energy_stats_std,
                 spk_emb=spk_emb,
                 spk_id=spk_id,
                 tone_id=tone_id,
+                alpha=alpha,
                 is_inference=True)
         else:
             # (1, L, odim)
@@ -921,9 +994,15 @@ class FastSpeech2Inference(nn.Layer):
         self.normalizer = normalizer
         self.acoustic_model = model

-    def forward(self, text, spk_id=None, spk_emb=None):
+    def forward(self, text, spk_id=None, spk_emb=None, durations_scale=None, durations_bias=None, pitch_scale=None,
+                pitch_bias=None, pitch_stats_mean=None, pitch_stats_std=None, energy_scale=None, energy_bias=None,
+                energy_stats_mean=None, energy_stats_std=None, use_teacher_forcing=False):
         normalized_mel, d_outs, p_outs, e_outs = self.acoustic_model.inference(
-            text, spk_id=spk_id, spk_emb=spk_emb)
+            text, spk_id=spk_id, spk_emb=spk_emb, durations_scale=durations_scale, durations_bias=durations_bias,
+            pitch_scale=pitch_scale, pitch_bias=pitch_bias, pitch_stats_mean=pitch_stats_mean,
+            pitch_stats_std=pitch_stats_std, energy_scale=energy_scale, energy_bias=energy_bias,
+            energy_stats_mean=energy_stats_mean, energy_stats_std=energy_stats_std,
+            use_teacher_forcing=use_teacher_forcing)
         logmel = self.normalizer.inverse(normalized_mel)
         return logmel
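At the model level the same controls flow through FastSpeech2Inference.forward.
A minimal sketch of a direct call (not part of the patch; it assumes an
am_inference object built the usual way from a normalizer plus a trained
FastSpeech2, a 1-D int64 phone-id tensor phone_ids, and placeholder energy
statistics):

    import paddle

    with paddle.no_grad():
        mel = am_inference(
            phone_ids,
            durations_scale=1.1,      # stretch durations -> slower speech
            energy_scale=0.9,         # slightly softer
            energy_stats_mean=0.0,    # placeholders for the model's energy CMVN stats
            energy_stats_std=1.0,
            # Route through the branch of FastSpeech2.inference that honors the
            # modified predictions (the CLI sets this flag automatically when
            # any scale or bias is given).
            use_teacher_forcing=True)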