Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into dev
@@ -0,0 +1,30 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: docs/src/conf.py

# Build documentation with MkDocs
#mkdocs:
#  configuration: mkdocs.yml

# Optionally build your docs in additional formats such as PDF
formats: []

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.7
  install:
    - method: pip
      path: .
      extra_requirements:
        - doc

    - requirements: docs/requirements.txt
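For reference, the build above corresponds roughly to the following local commands. This is only a sketch: the output directory and the exact `sphinx-build` invocation are assumptions, not part of the configuration file.

```bash
# Rough local equivalent of the Read the Docs build above (sketch).
pip install '.[doc]'                             # method: pip, path: ., extra_requirements: doc
pip install -r docs/requirements.txt             # the extra requirements file listed above
sphinx-build -b html docs/src docs/_build/html   # docs/src/conf.py is the Sphinx configuration
```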
@@ -0,0 +1,528 @@
"""Beam search module."""

from itertools import chain
from typing import Any
from typing import Dict
from typing import List
from typing import NamedTuple
from typing import Tuple
from typing import Union

import paddle

from .utils import end_detect
from .scorers.scorer_interface import PartialScorerInterface
from .scorers.scorer_interface import ScorerInterface

from deepspeech.utils.log import Log

logger = Log(__name__).getlog()


class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    yseq: paddle.Tensor  # (T,)
    score: Union[float, paddle.Tensor] = 0
    scores: Dict[str, Union[float, paddle.Tensor]] = dict()
    states: Dict[str, Any] = dict()

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        return self._replace(
            yseq=self.yseq.tolist(),
            score=float(self.score),
            scores={k: float(v) for k, v in self.scores.items()},
        )._asdict()


class BeamSearch(paddle.nn.Layer):
    """Beam search implementation."""

    def __init__(
        self,
        scorers: Dict[str, ScorerInterface],
        weights: Dict[str, float],
        beam_size: int,
        vocab_size: int,
        sos: int,
        eos: int,
        token_list: List[str] = None,
        pre_beam_ratio: float = 1.5,
        pre_beam_score_key: str = None,
    ):
        """Initialize beam search.

        Args:
            scorers (dict[str, ScorerInterface]): Dict of decoder modules
                e.g., Decoder, CTCPrefixScorer, LM
                The scorer will be ignored if it is `None`
            weights (dict[str, float]): Dict of weights for each scorer
                The scorer will be ignored if its weight is 0
            beam_size (int): The number of hypotheses kept during search
            vocab_size (int): The size of the vocabulary
            sos (int): Start of sequence id
            eos (int): End of sequence id
            token_list (list[str]): List of tokens for debug log
            pre_beam_score_key (str): key of scores to perform pre-beam search
            pre_beam_ratio (float): beam size in the pre-beam search
                will be `int(pre_beam_ratio * beam_size)`

        """
        super().__init__()
        # set scorers
        self.weights = weights
        self.scorers = dict()  # all = full + partial
        self.full_scorers = dict()  # full tokens
        self.part_scorers = dict()  # partial tokens
        # this module dict is required for recursive cast
        # `self.to(device, dtype)` in `recog.py`
        self.nn_dict = paddle.nn.LayerDict()  # nn.Layer
        for k, v in scorers.items():
            w = weights.get(k, 0)
            if w == 0 or v is None:
                continue
            assert isinstance(
                v, ScorerInterface
            ), f"{k} ({type(v)}) does not implement ScorerInterface"
            self.scorers[k] = v
            if isinstance(v, PartialScorerInterface):
                self.part_scorers[k] = v
            else:
                self.full_scorers[k] = v
            if isinstance(v, paddle.nn.Layer):
                self.nn_dict[k] = v

        # set configurations
        self.sos = sos
        self.eos = eos
        self.token_list = token_list
        # pre_beam_size > beam_size
        self.pre_beam_size = int(pre_beam_ratio * beam_size)
        self.beam_size = beam_size
        self.n_vocab = vocab_size
        if (
            pre_beam_score_key is not None
            and pre_beam_score_key != "full"
            and pre_beam_score_key not in self.full_scorers
        ):
            raise KeyError(f"{pre_beam_score_key} is not found in {self.full_scorers}")
        # selected `key` scorer to do pre beam search
        self.pre_beam_score_key = pre_beam_score_key
        # do pre-beam only when needed: a valid key is set and there are partial scorers
        self.do_pre_beam = (
            self.pre_beam_score_key is not None
            and self.pre_beam_size < self.n_vocab
            and len(self.part_scorers) > 0
        )

    def init_hyp(self, x: paddle.Tensor) -> List[Hypothesis]:
        """Get an initial hypothesis data.

        Args:
            x (paddle.Tensor): The encoder output feature, (T, D)

        Returns:
            Hypothesis: The initial hypothesis.

        """
        init_states = dict()
        init_scores = dict()
        for k, d in self.scorers.items():
            init_states[k] = d.init_state(x)
            init_scores[k] = 0.0
        return [
            Hypothesis(
                yseq=paddle.to_tensor([self.sos], place=x.place),
                score=0.0,
                scores=init_scores,
                states=init_states,
            )
        ]

    @staticmethod
    def append_token(xs: paddle.Tensor, x: int) -> paddle.Tensor:
        """Append new token to prefix tokens.

        Args:
            xs (paddle.Tensor): The prefix token, (T,)
            x (int): The new token to append

        Returns:
            paddle.Tensor: (T+1,), New tensor containing: xs + [x] with xs.dtype and xs.place

        """
        x = paddle.to_tensor([x], dtype=xs.dtype, place=xs.place)
        return paddle.concat((xs, x))

    def score_full(
        self, hyp: Hypothesis, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.full_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.full_scorers`
                and tensor score values of shape: `(self.n_vocab,)`,
                and state dict that has string keys
                and state values of `self.full_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.full_scorers.items():
            # scores[k] shape (self.n_vocab,)
            scores[k], states[k] = d.score(hyp.yseq, hyp.states[k], x)
        return scores, states

    def score_partial(
        self, hyp: Hypothesis, ids: paddle.Tensor, x: paddle.Tensor
    ) -> Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]:
        """Score new hypothesis by `self.part_scorers`.

        Args:
            hyp (Hypothesis): Hypothesis with prefix tokens to score
            ids (paddle.Tensor): 1D tensor of new partial tokens to score,
                len(ids) < n_vocab
            x (paddle.Tensor): Corresponding input feature, (T, D)

        Returns:
            Tuple[Dict[str, paddle.Tensor], Dict[str, Any]]: Tuple of
                score dict of `hyp` that has string keys of `self.part_scorers`
                and tensor score values of shape: `(len(ids),)`,
                and state dict that has string keys
                and state values of `self.part_scorers`

        """
        scores = dict()
        states = dict()
        for k, d in self.part_scorers.items():
            # scores[k] shape (len(ids),)
            scores[k], states[k] = d.score_partial(hyp.yseq, ids, hyp.states[k], x)
        return scores, states

    def beam(
        self, weighted_scores: paddle.Tensor, ids: paddle.Tensor
    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
        """Compute topk full token ids and partial token ids.

        Args:
            weighted_scores (paddle.Tensor): The weighted sum scores for each token.
                Its shape is `(self.n_vocab,)`.
            ids (paddle.Tensor): The partial token ids (global) to compute topk over.

        Returns:
            Tuple[paddle.Tensor, paddle.Tensor]:
                The topk full token ids and partial token ids.
                Their shapes are `(self.beam_size,)`,
                i.e. (global ids, local ids relative to `ids`).

        """
        # no pre-beam performed; `ids` covers the same vocabulary as `weighted_scores`
        if weighted_scores.shape[0] == ids.shape[0]:
            top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
            return top_ids, top_ids

        # mask tokens pruned by pre-beam so that they are not selected in topk
        tmp = weighted_scores[ids]
        weighted_scores[:] = -float("inf")
        weighted_scores[ids] = tmp
        # top_ids is not equal to local_ids, since their index spaces differ
        top_ids = weighted_scores.topk(self.beam_size)[1]  # index in n_vocab
        local_ids = weighted_scores[ids].topk(self.beam_size)[1]  # index in len(ids)
        return top_ids, local_ids

    @staticmethod
    def merge_scores(
        prev_scores: Dict[str, float],
        next_full_scores: Dict[str, paddle.Tensor],
        full_idx: int,
        next_part_scores: Dict[str, paddle.Tensor],
        part_idx: int,
    ) -> Dict[str, paddle.Tensor]:
        """Merge scores for new hypothesis.

        Args:
            prev_scores (Dict[str, float]):
                The previous hypothesis scores by `self.scorers`
            next_full_scores (Dict[str, paddle.Tensor]): scores by `self.full_scorers`
            full_idx (int): The next token id for `next_full_scores`
            next_part_scores (Dict[str, paddle.Tensor]):
                scores of partial tokens by `self.part_scorers`
            part_idx (int): The new token id for `next_part_scores`

        Returns:
            Dict[str, paddle.Tensor]: The new score dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are scalar tensors by the scorers.

        """
        new_scores = dict()
        for k, v in next_full_scores.items():
            new_scores[k] = prev_scores[k] + v[full_idx]
        for k, v in next_part_scores.items():
            new_scores[k] = prev_scores[k] + v[part_idx]
        return new_scores

    def merge_states(self, states: Any, part_states: Any, part_idx: int) -> Any:
        """Merge states for new hypothesis.

        Args:
            states: states of `self.full_scorers`
            part_states: states of `self.part_scorers`
            part_idx (int): The new token id for `part_scores`

        Returns:
            Dict[str, Any]: The new state dict.
                Its keys are names of `self.full_scorers` and `self.part_scorers`.
                Its values are states of the scorers.

        """
        new_states = dict()
        for k, v in states.items():
            new_states[k] = v
        for k, d in self.part_scorers.items():
            new_states[k] = d.select_state(part_states[k], part_idx)
        return new_states

    def search(
        self, running_hyps: List[Hypothesis], x: paddle.Tensor
    ) -> List[Hypothesis]:
        """Search new tokens for running hypotheses and encoded speech x.

        Args:
            running_hyps (List[Hypothesis]): Running hypotheses on beam
            x (paddle.Tensor): Encoded speech feature (T, D)

        Returns:
            List[Hypothesis]: Best sorted hypotheses

        """
        best_hyps = []
        part_ids = paddle.arange(self.n_vocab)  # no pre-beam
        for hyp in running_hyps:
            # scoring
            weighted_scores = paddle.zeros([self.n_vocab], dtype=x.dtype)
            scores, states = self.score_full(hyp, x)
            for k in self.full_scorers:
                weighted_scores += self.weights[k] * scores[k]
            # partial scoring
            if self.do_pre_beam:
                pre_beam_scores = (
                    weighted_scores
                    if self.pre_beam_score_key == "full"
                    else scores[self.pre_beam_score_key]
                )
                part_ids = paddle.topk(pre_beam_scores, self.pre_beam_size)[1]
            part_scores, part_states = self.score_partial(hyp, part_ids, x)
            for k in self.part_scorers:
                weighted_scores[part_ids] += self.weights[k] * part_scores[k]
            # add previous hyp score
            weighted_scores += hyp.score

            # update hyps
            for j, part_j in zip(*self.beam(weighted_scores, part_ids)):
                # `part_j` is the id of `j` relative to `part_scores`
                # will be (2 x beam at most)
                best_hyps.append(
                    Hypothesis(
                        score=weighted_scores[j],
                        yseq=self.append_token(hyp.yseq, j),
                        scores=self.merge_scores(
                            hyp.scores, scores, j, part_scores, part_j
                        ),
                        states=self.merge_states(states, part_states, part_j),
                    )
                )

            # sort and prune 2 x beam -> beam
            best_hyps = sorted(best_hyps, key=lambda x: x.score, reverse=True)[
                : min(len(best_hyps), self.beam_size)
            ]
        return best_hyps

    def forward(
        self, x: paddle.Tensor, maxlenratio: float = 0.0, minlenratio: float = 0.0
    ) -> List[Hypothesis]:
        """Perform beam search.

        Args:
            x (paddle.Tensor): Encoded speech feature (T, D)
            maxlenratio (float): Input length ratio to obtain max output length.
                If maxlenratio=0.0 (default), it uses an end-detect function
                to automatically find maximum hypothesis lengths.
                If maxlenratio<0.0, its absolute value is interpreted
                as a constant max output length.
            minlenratio (float): Input length ratio to obtain min output length.

        Returns:
            list[Hypothesis]: N-best decoding results

        """
        # set length bounds
        if maxlenratio == 0:
            maxlen = x.shape[0]
        elif maxlenratio < 0:
            maxlen = -1 * int(maxlenratio)
        else:
            maxlen = max(1, int(maxlenratio * x.shape[0]))
        minlen = int(minlenratio * x.shape[0])
        logger.info("decoder input length: " + str(x.shape[0]))
        logger.info("max output length: " + str(maxlen))
        logger.info("min output length: " + str(minlen))

        # main loop of prefix search
        running_hyps = self.init_hyp(x)
        ended_hyps = []
        for i in range(maxlen):
            logger.debug("position " + str(i))
            best = self.search(running_hyps, x)
            # post process of one iteration
            running_hyps = self.post_process(i, maxlen, maxlenratio, best, ended_hyps)
            # end detection
            if maxlenratio == 0.0 and end_detect([h.asdict() for h in ended_hyps], i):
                logger.info(f"end detected at {i}")
                break
            if len(running_hyps) == 0:
                logger.info("no hypothesis. Finish decoding.")
                break
            else:
                logger.debug(f"remaining hypotheses: {len(running_hyps)}")

        nbest_hyps = sorted(ended_hyps, key=lambda x: x.score, reverse=True)
        # check the number of hypotheses reaching eos
        if len(nbest_hyps) == 0:
            logger.warning(
                "there are no N-best results; perform recognition "
                "again with a smaller minlenratio."
            )
            return (
                []
                if minlenratio < 0.1
                else self.forward(x, maxlenratio, max(0.0, minlenratio - 0.1))
            )

        # report the best result
        best = nbest_hyps[0]
        for k, v in best.scores.items():
            logger.info(
                f"{v:6.2f} * {self.weights[k]:3} = {v * self.weights[k]:6.2f} for {k}"
            )
        logger.info(f"total log probability: {best.score:.2f}")
        logger.info(f"normalized log probability: {best.score / len(best.yseq):.2f}")
        logger.info(f"total number of ended hypotheses: {len(nbest_hyps)}")
        if self.token_list is not None:
            logger.info(
                "best hypo: "
                + "".join([self.token_list[x] for x in best.yseq[1:-1]])
                + "\n"
            )
        return nbest_hyps

    def post_process(
        self,
        i: int,
        maxlen: int,
        maxlenratio: float,
        running_hyps: List[Hypothesis],
        ended_hyps: List[Hypothesis],
    ) -> List[Hypothesis]:
        """Perform post-processing of beam search iterations.

        Args:
            i (int): The length of hypothesis tokens.
            maxlen (int): The maximum length of tokens in beam search.
            maxlenratio (float): The maximum length ratio in beam search.
            running_hyps (List[Hypothesis]): The running hypotheses in beam search.
            ended_hyps (List[Hypothesis]): The ended hypotheses in beam search.

        Returns:
            List[Hypothesis]: The new running hypotheses.

        """
        logger.debug(f"the number of running hypotheses: {len(running_hyps)}")
        if self.token_list is not None:
            logger.debug(
                "best hypo: "
                + "".join([self.token_list[x] for x in running_hyps[0].yseq[1:]])
            )
        # add eos in the final loop to avoid that there are no ended hyps
        if i == maxlen - 1:
            logger.info("adding <eos> in the last position in the loop")
            running_hyps = [
                h._replace(yseq=self.append_token(h.yseq, self.eos))
                for h in running_hyps
            ]

        # add ended hypotheses to a final list, and remove them from the current hypotheses
        # (this can be a problem: the number of running hyps may drop below the beam size)
        remained_hyps = []
        for hyp in running_hyps:
            if hyp.yseq[-1] == self.eos:
                # e.g., Word LM needs to add final <eos> score
                for k, d in chain(self.full_scorers.items(), self.part_scorers.items()):
                    s = d.final_score(hyp.states[k])
                    hyp.scores[k] += s
                    hyp = hyp._replace(score=hyp.score + self.weights[k] * s)
                ended_hyps.append(hyp)
            else:
                remained_hyps.append(hyp)
        return remained_hyps


def beam_search(
    x: paddle.Tensor,
    sos: int,
    eos: int,
    beam_size: int,
    vocab_size: int,
    scorers: Dict[str, ScorerInterface],
    weights: Dict[str, float],
    token_list: List[str] = None,
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    pre_beam_ratio: float = 1.5,
    pre_beam_score_key: str = "full",
) -> list:
    """Perform beam search with scorers.

    Args:
        x (paddle.Tensor): Encoded speech feature (T, D)
        sos (int): Start of sequence id
        eos (int): End of sequence id
        beam_size (int): The number of hypotheses kept during search
        vocab_size (int): The size of the vocabulary
        scorers (dict[str, ScorerInterface]): Dict of decoder modules
            e.g., Decoder, CTCPrefixScorer, LM
            The scorer will be ignored if it is `None`
        weights (dict[str, float]): Dict of weights for each scorer
            The scorer will be ignored if its weight is 0
        token_list (list[str]): List of tokens for debug log
        maxlenratio (float): Input length ratio to obtain max output length.
            If maxlenratio=0.0 (default), it uses an end-detect function
            to automatically find maximum hypothesis lengths
        minlenratio (float): Input length ratio to obtain min output length.
        pre_beam_score_key (str): key of scores to perform pre-beam search
        pre_beam_ratio (float): beam size in the pre-beam search
            will be `int(pre_beam_ratio * beam_size)`

    Returns:
        List[Dict]: N-best decoding results

    """
    ret = BeamSearch(
        scorers,
        weights,
        beam_size=beam_size,
        vocab_size=vocab_size,
        pre_beam_ratio=pre_beam_ratio,
        pre_beam_score_key=pre_beam_score_key,
        sos=sos,
        eos=eos,
        token_list=token_list,
    ).forward(x=x, maxlenratio=maxlenratio, minlenratio=minlenratio)
    return [h.asdict() for h in ret]
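A minimal usage sketch of the module above. The import paths are inferred from the relative imports in the file, and the toy scorer is purely illustrative (a real setup would pass a decoder, a CTC prefix scorer and/or an LM); treat this as an assumption-laden example, not as part of the PR.

```python
import paddle

# Import paths assumed from the relative imports above (".utils", ".scorers.scorer_interface").
from deepspeech.decoders.beam_search import BeamSearch
from deepspeech.decoders.scorers.scorer_interface import ScorerInterface


class LengthBonus(ScorerInterface):
    """Toy full scorer: a constant bonus for every token (illustrative only)."""

    def __init__(self, n_vocab: int):
        self.n_vocab = n_vocab

    def init_state(self, x):
        return None

    def score(self, ys, state, x):
        # Uniform "log score" over the vocabulary, shape (n_vocab,).
        return paddle.ones([self.n_vocab], dtype=x.dtype), None


enc_out = paddle.randn([50, 256])  # (T, D) stand-in for a real encoder output
searcher = BeamSearch(
    scorers={"length_bonus": LengthBonus(5000)},
    weights={"length_bonus": 1.0},
    beam_size=10,
    vocab_size=5000,
    sos=4999,
    eos=4999,
)
nbest = searcher(enc_out, maxlenratio=0.0, minlenratio=0.0)  # Layer.__call__ -> forward()
print(nbest[0].yseq.numpy(), float(nbest[0].score))
```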
@@ -0,0 +1,187 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
import os
import sys

import paddle
import soundfile

from deepspeech.exps.u2.config import get_cfg_defaults
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.io.collator import SpeechCollator
from deepspeech.models.u2 import U2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.training.trainer import Trainer
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
from deepspeech.utils.log import Log
from deepspeech.utils.utility import print_arguments
from deepspeech.utils.utility import UpdateConfig

logger = Log(__name__).getlog()

# TODO(hui zhang): dynamic load


class U2Tester_Hub(Trainer):
    def __init__(self, config, args):
        # super().__init__(config, args)
        self.args = args
        self.config = config
        self.audio_file = args.audio_file
        self.collate_fn_test = SpeechCollator.from_config(config)
        self._text_featurizer = TextFeaturizer(
            unit_type=config.collator.unit_type,
            vocab_filepath=None,
            spm_model_prefix=config.collator.spm_model_prefix)

    def setup_model(self):
        config = self.config
        model_conf = config.model

        with UpdateConfig(model_conf):
            model_conf.input_dim = self.collate_fn_test.feature_size
            model_conf.output_dim = self.collate_fn_test.vocab_size

        model = U2Model.from_config(model_conf)

        if self.parallel:
            model = paddle.DataParallel(model)

        logger.info(f"{model}")
        layer_tools.print_params(model, logger.info)

        self.model = model
        logger.info("Setup model")

    @mp_tools.rank_zero_only
    @paddle.no_grad()
    def test(self):
        self.model.eval()
        cfg = self.config.decoding
        audio_file = self.audio_file
        collate_fn_test = self.collate_fn_test
        audio, _ = collate_fn_test.process_utterance(
            audio_file=audio_file, transcript="Hello")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        vocab_list = collate_fn_test.vocab_list

        text_feature = self.collate_fn_test.text_feature
        result_transcripts = self.model.decode(
            audio,
            audio_len,
            text_feature=text_feature,
            decoding_method=cfg.decoding_method,
            lang_model_path=cfg.lang_model_path,
            beam_alpha=cfg.alpha,
            beam_beta=cfg.beta,
            beam_size=cfg.beam_size,
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch,
            ctc_weight=cfg.ctc_weight,
            decoding_chunk_size=cfg.decoding_chunk_size,
            num_decoding_left_chunks=cfg.num_decoding_left_chunks,
            simulate_streaming=cfg.simulate_streaming)
        logger.info("The result_transcripts: " + result_transcripts[0][0])

    def run_test(self):
        self.resume()
        try:
            self.test()
        except KeyboardInterrupt:
            sys.exit(-1)

    def setup(self):
        """Setup the experiment.
        """
        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')

        #self.setup_output_dir()
        #self.setup_checkpointer()

        #self.setup_dataloader()
        self.setup_model()

        self.iteration = 0
        self.epoch = 0

    def resume(self):
        """Resume from the checkpoint at checkpoints in the output
        directory or load a specified checkpoint.
        """
        params_path = self.args.checkpoint_path + ".pdparams"
        model_dict = paddle.load(params_path)
        self.model.set_state_dict(model_dict)


def check(audio_file):
    logger.info("checking the audio file format ...")
    try:
        sig, sample_rate = soundfile.read(audio_file)
    except Exception as e:
        logger.error(str(e))
        logger.error(
            "cannot open the wav file, please check the audio file format")
        sys.exit(-1)
    logger.info("The sample rate is %d" % sample_rate)
    assert (sample_rate == 16000)
    logger.info("The audio file format is correct")


def main_sp(config, args):
    exp = U2Tester_Hub(config, args)
    with exp.eval():
        exp.setup()
        exp.run_test()


def main(config, args):
    main_sp(config, args)


if __name__ == "__main__":
    parser = default_argument_parser()
    # save asr result to
    parser.add_argument(
        "--result_file", type=str, help="path to save the asr result")
    parser.add_argument(
        "--audio_file", type=str, help="path of the input audio file")
    args = parser.parse_args()
    print_arguments(args, globals())

    if not os.path.isfile(args.audio_file):
        print("Please input a valid audio file path")
        sys.exit(-1)
    check(args.audio_file)
    # https://yaml.org/type/float.html
    config = get_cfg_defaults()
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:
        config.merge_from_list(args.opts)
    config.freeze()
    print(config)
    if args.dump_config:
        with open(args.dump_config, 'w') as f:
            print(config, file=f)

    # Setting for profiling
    pr = cProfile.Profile()
    pr.runcall(main, config, args)
    pr.dump_stats('test.profile')
@@ -1,7 +0,0 @@
myst_parser
numpydoc
recommonmark>=0.5.0
sphinx
sphinx-autobuild
sphinx-markdown-tables
sphinx_rtd_theme
@@ -0,0 +1,130 @@
# Parakeet
Parakeet aims to provide a flexible, efficient and state-of-the-art text-to-speech toolkit for the open-source community. It is built on PaddlePaddle dynamic graph and includes many influential TTS models.

<div align="center">
  <img src="../../images/logo.png" width=300 /> <br>
</div>


## News <img src="../../images/news_icon.png" width="40"/>
- Oct-12-2021, Refactor examples code.
- Oct-12-2021, Parallel WaveGAN with LJSpeech. Check [examples/GANVocoder/parallelwave_gan/ljspeech](./examples/GANVocoder/parallelwave_gan/ljspeech).
- Oct-12-2021, FastSpeech2/FastPitch with LJSpeech. Check [examples/fastspeech2/ljspeech](./examples/fastspeech2/ljspeech).
- Sep-14-2021, Reconstruction of TransformerTTS. Check [examples/transformer_tts/ljspeech](./examples/transformer_tts/ljspeech).
- Aug-31-2021, Chinese Text Frontend. Check [examples/text_frontend](./examples/text_frontend).
- Aug-23-2021, FastSpeech2/FastPitch with AISHELL-3. Check [examples/fastspeech2/aishell3](./examples/fastspeech2/aishell3).
- Aug-03-2021, FastSpeech2/FastPitch with CSMSC. Check [examples/fastspeech2/baker](./examples/fastspeech2/baker).
- Jul-19-2021, SpeedySpeech with CSMSC. Check [examples/speedyspeech/baker](./examples/speedyspeech/baker).
- Jul-01-2021, Parallel WaveGAN with CSMSC. Check [examples/GANVocoder/parallelwave_gan/baker](./examples/GANVocoder/parallelwave_gan/baker).
- Jul-01-2021, Montreal-Forced-Aligner. Check [examples/use_mfa](./examples/use_mfa).
- May-07-2021, Voice Cloning in Chinese. Check [examples/tacotron2_aishell3](./examples/tacotron2_aishell3).

## Overview

In order to facilitate exploiting the existing TTS models directly and developing new ones, Parakeet selects typical models and provides their reference implementations in PaddlePaddle. Furthermore, Parakeet abstracts the TTS pipeline and standardizes the procedure of data preprocessing, common module sharing, model configuration, and the processes of training and synthesis. The models supported here include the Text FrontEnd, end-to-end Acoustic models, and Vocoders:

- Text FrontEnd
  - Rule-based Chinese frontend.

- Acoustic Models
  - [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558)
  - [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802)
  - [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895)
  - [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884)
- Vocoders
  - [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480)
  - [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219)
- Voice Cloning
  - [Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558v4.pdf)
  - [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467)

## Setup
It is difficult to install some of this repo's dependencies on Windows, so we recommend that you **DO NOT** use Windows; please use `Linux`.

Make sure the library `libsndfile1` is installed, e.g., on Ubuntu:

```bash
sudo apt-get install libsndfile1
```
### Install PaddlePaddle
See [install](https://www.paddlepaddle.org.cn/install/quick) for more details. This repo requires PaddlePaddle **2.1.2** or above.
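For example, a CPU-only installation that satisfies this requirement can be done with `pip`; the exact package (CPU vs. GPU) and version should be chosen for your own device by following the link above:

```bash
pip install paddlepaddle==2.1.2   # or paddlepaddle-gpu for CUDA devices
```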

### Install Parakeet

```bash
git clone https://github.com/PaddlePaddle/Parakeet
cd Parakeet
pip install -e .
```

If some Python dependencies cannot be installed successfully, you can run the following script first
(replace `python3.6` with your own Python version):
```bash
sudo apt install -y python3.6-dev
```

See [install](https://paddle-parakeet.readthedocs.io/en/latest/install.html) for more details.

## Examples
Entry points to the introduction, and to launching training and synthesis, for the different example models:

- [>>> Chinese Text Frontend](./examples/text_frontend)
- [>>> FastSpeech2/FastPitch](./examples/fastspeech2)
- [>>> Montreal-Forced-Aligner](./examples/use_mfa)
- [>>> Parallel WaveGAN](./examples/GANVocoder/parallelwave_gan)
- [>>> SpeedySpeech](./examples/speedyspeech)
- [>>> Tacotron2_AISHELL3](./examples/tacotron2_aishell3)
- [>>> GE2E](./examples/ge2e)
- [>>> WaveFlow](./examples/waveflow)
- [>>> TransformerTTS](./examples/transformer_tts)
- [>>> Tacotron2](./examples/tacotron2)

## Audio samples
### TTS models (Acoustic Model + Neural Vocoder)
Check our [website](https://paddleparakeet.readthedocs.io/en/latest/demo.html) for audio samples.

## Released Models

### Acoustic Model

#### FastSpeech2/FastPitch
1. [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_baker_ckpt_0.4.zip)
2. [fastspeech2_nosil_aishell3_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_aishell3_ckpt_0.4.zip)
3. [fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)

#### SpeedySpeech
1. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)

#### TransformerTTS

1. [transformer_tts_ljspeech_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/transformer_tts_ljspeech_ckpt_0.4.zip)

#### Tacotron2

1. [tacotron2_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3.zip)
2. [tacotron2_ljspeech_ckpt_0.3_alternative.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_ckpt_0.3_alternative.zip)

### Vocoder

#### WaveFlow

1. [waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_ljspeech_ckpt_0.3.zip)

#### Parallel WaveGAN

1. [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip)
2. [pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_ljspeech_ckpt_0.5.zip)

### Voice Cloning

#### Tacotron2_AISHELL3

1. [tacotron2_aishell3_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_aishell3_ckpt_0.3.zip)

#### GE2E

1. [ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/ge2e_ckpt_0.3.zip)

## License

Parakeet is provided under the [Apache-2.0 license](LICENSE).
@@ -0,0 +1,583 @@
Audio Sample
==================

The main processes of TTS include:

1. Converting the original text into characters/phonemes, through the ``text frontend`` module.

2. Converting characters/phonemes into acoustic features, such as linear spectrogram, mel spectrogram, LPC features, etc., through ``Acoustic models``.

3. Converting acoustic features into waveforms through ``Vocoders``.

When training ``Tacotron2``, ``TransformerTTS`` and ``WaveFlow``, we use the English single-speaker TTS dataset `LJSpeech <https://keithito.com/LJ-Speech-Dataset/>`_ by default. However, when training ``SpeedySpeech``, ``FastSpeech2`` and ``ParallelWaveGAN``, we use the Chinese single-speaker dataset `CSMSC <https://test.data-baker.com/data/index/source/>`_ by default.

In the future, ``Parakeet`` will mainly use Chinese TTS datasets for its default examples.

Here, we display three types of audio samples:

1. Analysis/synthesis (ground-truth spectrograms + Vocoder)

2. TTS (Acoustic model + Vocoder)

3. Chinese TTS with/without text frontend (mainly tone sandhi)

Analysis/synthesis
--------------------------

Audio samples generated from ground-truth spectrograms with a vocoder.

.. raw:: html
|
||||
|
||||
<b>LJSpeech(English)</b>
|
||||
<br>
|
||||
</br>
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT </th>
|
||||
<th align="left"> WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<br>
|
||||
</br>
|
||||
<b>CSMSC(Chinese)</b>
|
||||
<br>
|
||||
</br>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> GT (convert to 24k) </th>
|
||||
<th align="left"> ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009901.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009902.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009903.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009904.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009905.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
</table>
|
||||
|
||||
|
||||
TTS
|
||||
-------------------
|
||||
|
||||
Audio samples generated by a TTS system. Text is first transformed into a spectrogram by a text-to-spectrogram model, then the spectrogram is converted into raw audio by a vocoder.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> TransformerTTS + WaveFlow </th>
|
||||
<th align="left"> Tacotron2 + WaveFlow </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> SpeedySpeech + ParallelWaveGAN </th>
|
||||
<th align="left"> FastSpeech2 + ParallelWaveGAN </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
|
||||
Chinese TTS with/without text frontend
|
||||
--------------------------------------
|
||||
|
||||
We provide a complete Chinese text frontend module in ``Parakeet``. ``Text Normalization`` and ``G2P`` are the most important modules in a text frontend. We assume here that the texts are already normalized, and mainly compare the ``G2P`` module.
|
||||
|
||||
We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
|
||||
|
||||
.. raw:: html
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th align="left"> With Text Frontend </th>
|
||||
<th align="left"> Without Text Frontend </th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
<td>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/001.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/002.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/003.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/004.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/005.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/006.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/007.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/008.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/009.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
<audio controls="controls">
|
||||
<source
|
||||
src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/010.wav"
|
||||
type="audio/wav">
|
||||
Your browser does not support the <code>audio</code> element.
|
||||
</audio>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
|
||||
</table>
|
@ -0,0 +1,45 @@
|
||||
.. parakeet documentation master file, created by
|
||||
sphinx-quickstart on Fri Sep 10 14:22:24 2021.
|
||||
You can adapt this file completely to your liking, but it should at least
|
||||
contain the root `toctree` directive.
|
||||
|
||||
Parakeet
|
||||
====================================
|
||||
|
||||
``parakeet`` is a deep-learning-based text-to-speech toolkit built upon the ``paddlepaddle`` framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by `Baidu Research <http://research.baidu.com>`_ and other research groups.
|
||||
|
||||
``parakeet`` mainly consists of the components below.
|
||||
|
||||
#. Implementation of models and commonly used neural network layers.
|
||||
#. Dataset abstraction and common data preprocessing pipelines.
|
||||
#. Ready-to-run experiments.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Introduction
|
||||
|
||||
introduction
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Getting started
|
||||
|
||||
install
|
||||
basic_usage
|
||||
advanced_usage
|
||||
cn_text_frontend
|
||||
released_models
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 1
|
||||
:caption: Demos
|
||||
|
||||
demo
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
* :ref:`genindex`
|
||||
* :ref:`modindex`
|
||||
* :ref:`search`
|
@ -0,0 +1,47 @@
|
||||
# Installation
|
||||
## Install PaddlePaddle
|
||||
Parakeet requires PaddlePaddle as its backend. Note that PaddlePaddle 2.1.2 or newer is required.
|
||||
|
||||
Since paddlepaddle has multiple packages depending on the device (cpu or gpu) and the dependency libraries, it is recommended to install a proper package of paddlepaddle with respect to the device and dependency library versions via `pip`.
|
||||
|
||||
Installing paddlepaddle with conda or building paddlepaddle from source is also supported. Please refer to [PaddlePaddle installation](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/pip/linux-pip.html) for more details.
|
||||
|
||||
Example instructions to install paddlepaddle via pip are listed below.
|
||||
|
||||
### PaddlePaddle with GPU
|
||||
```bash
|
||||
# PaddlePaddle for CUDA 10.1
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post101 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 10.2
|
||||
python -m pip install paddlepaddle-gpu -i https://mirror.baidu.com/pypi/simple
|
||||
# PaddlePaddle for CUDA 11.0
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post110 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
# PaddlePaddle for CUDA 11.2
|
||||
python -m pip install paddlepaddle-gpu==2.1.2.post112 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
|
||||
```
|
||||
### PaddlePaddle with CPU
|
||||
```bash
|
||||
python -m pip install paddlepaddle==2.1.2 -i https://mirror.baidu.com/pypi/simple
|
||||
```
|
||||
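After installation you can optionally check that paddlepaddle works on your device. This is a hedged sketch; `paddle.utils.run_check()` runs PaddlePaddle's built-in sanity check and reports whether the installation (and GPU, if any) is usable.

```bash
python -c "import paddle; paddle.utils.run_check()"
```
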
## Install libsndfile
|
||||
Experiments in parakeet often involve audio and spectrum processing, thus `librosa` and `soundfile` are required. `soundfile` requires an extra C library, `libsndfile`, which is not always handled by pip.
|
||||
|
||||
For Windows and Mac users, `libsndfile` is also installed when installing `soundfile` via pip, but for Linux users, installing `libsndfile` via the system package manager is required. Example commands for popular distributions are listed below.
|
||||
```bash
|
||||
# ubuntu, debian
|
||||
sudo apt-get install libsndfile1
|
||||
# centos, fedora
|
||||
sudo yum install libsndfile
|
||||
# openSUSE
|
||||
sudo zypper in libsndfile
|
||||
```
|
||||
For any problem with the installation of soundfile, please refer to [SoundFile](https://pypi.org/project/SoundFile/).
|
||||
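Once `libsndfile` is in place, a quick sanity check (a hedged sketch; it simply asks `soundfile` which `libsndfile` version it loaded) is:

```bash
python -c "import soundfile; print(soundfile.__libsndfile_version__)"
```
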
## Install Parakeet
|
||||
There are two ways to install parakeet, depending on how you intend to use it.
|
||||
|
||||
1. If you want to run experiments provided by parakeet or add new models and experiments, it is recommended to clone the project from GitHub (Parakeet) and install it in editable mode.
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/Parakeet
|
||||
cd Parakeet
|
||||
pip install -e .
|
||||
```
|
@ -0,0 +1,27 @@
|
||||
# Parakeet - PAddle PARAllel text-to-speech toolKIT
|
||||
|
||||
## What is Parakeet?
|
||||
Parakeet is a deep-learning-based text-to-speech toolkit built upon the PaddlePaddle framework. It aims to provide a flexible, efficient, and state-of-the-art text-to-speech toolkit for the open-source community. It includes many influential TTS models proposed by Baidu Research and other research groups.
|
||||
|
||||
## What can Parakeet do?
|
||||
Parakeet mainly consists of the components below:
|
||||
- Implementation of models and commonly used neural network layers.
|
||||
- Dataset abstraction and common data preprocessing pipelines.
|
||||
- Ready-to-run experiments.
|
||||
|
||||
Parakeet provides you with a complete TTS pipeline, including:
|
||||
- Text FrontEnd
|
||||
- Rule-based Chinese frontend.
|
||||
- Acoustic Models
|
||||
- FastSpeech2
|
||||
- SpeedySpeech
|
||||
- TransformerTTS
|
||||
- Tacotron2
|
||||
- Vocoders
|
||||
- Parallel WaveGAN
|
||||
- WaveFlow
|
||||
- Voice Cloning
|
||||
- Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis
|
||||
- GE2E
|
||||
|
||||
Parakeet helps you to train TTS models with simple commands.
|
@ -0,0 +1,47 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ $# != 3 ];then
|
||||
echo "usage: ${0} config_path ckpt_path_prefix audio_file"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
|
||||
echo "using $ngpu gpus..."
|
||||
|
||||
config_path=$1
|
||||
ckpt_prefix=$2
|
||||
audio_file=$3
|
||||
|
||||
chunk_mode=false
|
||||
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
|
||||
chunk_mode=true
|
||||
fi
|
||||
|
||||
# download language model
|
||||
#bash local/download_lm_ch.sh
|
||||
#if [ $? -ne 0 ]; then
|
||||
# exit 1
|
||||
#fi
|
||||
|
||||
|
||||
|
||||
for type in attention_rescoring; do
|
||||
echo "decoding ${type}"
|
||||
batch_size=1
|
||||
output_dir=${ckpt_prefix}
|
||||
mkdir -p ${output_dir}
|
||||
python3 -u ${BIN_DIR}/test_hub.py \
|
||||
--nproc ${ngpu} \
|
||||
--config ${config_path} \
|
||||
--result_file ${output_dir}/${type}.rsl \
|
||||
--checkpoint_path ${ckpt_prefix} \
|
||||
--opts decoding.decoding_method ${type} \
|
||||
--opts decoding.batch_size ${batch_size} \
|
||||
--audio_file ${audio_file}
|
||||
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed in evaluation!"
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
exit 0
|
@ -0,0 +1,4 @@
|
||||
# Aishell3
|
||||
|
||||
* tts0 - fastspeech2
|
||||
* vc0 - tacotron2 voice cloning
|
@ -0,0 +1,88 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from paddle.io import Dataset
|
||||
from parakeet.frontend import Vocab
|
||||
from parakeet.data import batch_text_id, batch_spec
|
||||
|
||||
from preprocess_transcription import _phones, _tones
|
||||
|
||||
voc_phones = Vocab(sorted(list(_phones)))
|
||||
print("vocab_phones:\n", voc_phones)
|
||||
voc_tones = Vocab(sorted(list(_tones)))
|
||||
print("vocab_tones:\n", voc_tones)
|
||||
|
||||
|
||||
class AiShell3(Dataset):
|
||||
"""Processed AiShell3 dataset."""
|
||||
|
||||
def __init__(self, root):
|
||||
super().__init__()
|
||||
self.root = Path(root).expanduser()
|
||||
self.embed_dir = self.root / "embed"
|
||||
self.mel_dir = self.root / "mel"
|
||||
|
||||
with open(self.root / "metadata.pickle", 'rb') as f:
|
||||
self.records = pickle.load(f)
|
||||
|
||||
def __getitem__(self, index):
|
||||
metadatum = self.records[index]
|
||||
sentence_id = metadatum["sentence_id"]
|
||||
speaker_id = sentence_id[:7]
|
||||
phones = metadatum["phones"]
|
||||
tones = metadatum["tones"]
|
||||
phones = np.array(
|
||||
[voc_phones.lookup(item) for item in phones], dtype=np.int64)
|
||||
tones = np.array(
|
||||
[voc_tones.lookup(item) for item in tones], dtype=np.int64)
|
||||
mel = np.load(str(self.mel_dir / speaker_id / (sentence_id + ".npy")))
|
||||
embed = np.load(
|
||||
str(self.embed_dir / speaker_id / (sentence_id + ".npy")))
|
||||
return phones, tones, mel, embed
|
||||
|
||||
def __len__(self):
|
||||
return len(self.records)
|
||||
|
||||
|
||||
def collate_aishell3_examples(examples):
|
||||
phones, tones, mel, embed = list(zip(*examples))
|
||||
|
||||
text_lengths = np.array([item.shape[0] for item in phones], dtype=np.int64)
|
||||
spec_lengths = np.array([item.shape[1] for item in mel], dtype=np.int64)
|
||||
T_dec = np.max(spec_lengths)
|
||||
stop_tokens = (
|
||||
np.arange(T_dec) >= np.expand_dims(spec_lengths, -1)).astype(np.float32)
|
||||
phones, _ = batch_text_id(phones)
|
||||
tones, _ = batch_text_id(tones)
|
||||
mel, _ = batch_spec(mel)
|
||||
mel = np.transpose(mel, (0, 2, 1))
|
||||
embed = np.stack(embed)
|
||||
# 7 fields
|
||||
# (B, T), (B, T), (B, T, C), (B, C), (B,), (B,), (B, T)
|
||||
return phones, tones, mel, embed, text_lengths, spec_lengths, stop_tokens
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
dataset = AiShell3("~/datasets/aishell3/train")
|
||||
example = dataset[0]
|
||||
|
||||
examples = [dataset[i] for i in range(10)]
|
||||
batch = collate_aishell3_examples(examples)
|
||||
|
||||
for field in batch:
|
||||
print(field.shape, field.dtype)
|
@ -0,0 +1,39 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List, Tuple
|
||||
from pypinyin import lazy_pinyin, Style
|
||||
from preprocess_transcription import split_syllable
|
||||
|
||||
|
||||
def convert_to_pinyin(text: str) -> List[str]:
|
||||
"""convert text into list of syllables, other characters that are not chinese, thus
|
||||
cannot be converted to pinyin, are left as they are.
|
||||
"""
|
||||
syllables = lazy_pinyin(
|
||||
text, style=Style.TONE3, neutral_tone_with_five=True)
|
||||
return syllables
|
||||
|
||||
|
||||
def convert_sentence(text: str) -> Tuple[List[str], List[str]]:
|
||||
"""convert a sentence into two list: phones and tones"""
|
||||
syllables = convert_to_pinyin(text)
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in syllables:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
|
||||
return phones, tones
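

# A hedged usage sketch (not executed here). Given pypinyin's conversion of
# "中国" to "zhong1 guo2", the rules above should yield:
#     >>> convert_sentence("中国")
#     (['zh', 'ueng', 'g', 'uo'], ['0', '1', '0', '2'])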
|
@ -0,0 +1,82 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from yacs.config import CfgNode as CN
|
||||
|
||||
_C = CN()
|
||||
_C.data = CN(
|
||||
dict(
|
||||
batch_size=32, # batch size
|
||||
valid_size=64, # the first N examples are reserved for validation
|
||||
sample_rate=22050, # Hz, sample rate
|
||||
n_fft=1024, # fft frame size
|
||||
win_length=1024, # window size
|
||||
hop_length=256, # hop size between adjacent frames
|
||||
fmax=8000, # Hz, max frequency when converting to mel
|
||||
fmin=0, # Hz, min frequency when converting to mel
|
||||
d_mels=80, # mel bands
|
||||
padding_idx=0, # text embedding's padding index
|
||||
))
|
||||
|
||||
_C.model = CN(
|
||||
dict(
|
||||
vocab_size=70,
|
||||
n_tones=10,
|
||||
reduction_factor=1, # reduction factor
|
||||
d_encoder=512, # embedding & encoder's internal size
|
||||
encoder_conv_layers=3, # number of conv layer in tacotron2 encoder
|
||||
encoder_kernel_size=5, # kernel size of conv layers in tacotron2 encoder
|
||||
d_prenet=256, # hidden size of decoder prenet
|
||||
# hidden size of the first rnn layer in tacotron2 decoder
|
||||
d_attention_rnn=1024,
|
||||
# hidden size of the second rnn layer in tacotron2 decoder
|
||||
d_decoder_rnn=1024,
|
||||
d_attention=128, # hidden size of decoder location linear layer
|
||||
attention_filters=32, # number of filter in decoder location conv layer
|
||||
attention_kernel_size=31, # kernel size of decoder location conv layer
|
||||
d_postnet=512, # hidden size of decoder postnet
|
||||
postnet_kernel_size=5, # kernel size of conv layers in postnet
|
||||
postnet_conv_layers=5, # number of conv layer in decoder postnet
|
||||
p_encoder_dropout=0.5, # dropout probability in encoder
|
||||
p_prenet_dropout=0.5, # dropout probability in decoder prenet
|
||||
|
||||
# dropout probability of first rnn layer in decoder
|
||||
p_attention_dropout=0.1,
|
||||
# dropout probability of second rnn layer in decoder
|
||||
p_decoder_dropout=0.1,
|
||||
p_postnet_dropout=0.5, # dropout probability in decoder postnet
|
||||
guided_attention_loss_sigma=0.2,
|
||||
d_global_condition=256,
|
||||
|
||||
# whether to use a classifier to predict stop probability
|
||||
use_stop_token=False,
|
||||
# whether to use guided attention loss in training
|
||||
use_guided_attention_loss=True, ))
|
||||
|
||||
_C.training = CN(
|
||||
dict(
|
||||
lr=1e-3, # learning rate
|
||||
weight_decay=1e-6, # the coeff of weight decay
|
||||
grad_clip_thresh=1.0, # the clip norm of grad clip.
|
||||
valid_interval=1000, # validation
|
||||
save_interval=1000, # checkpoint
|
||||
max_iteration=500000, # max iteration to train
|
||||
))
|
||||
|
||||
|
||||
def get_cfg_defaults():
|
||||
"""Get a yacs CfgNode object with default values for my_project."""
|
||||
# Return a clone so that the defaults will not be altered
|
||||
# This is for the "local variable" use pattern
|
||||
return _C.clone()
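

# A hedged usage sketch (mirrors how the training and feature-extraction scripts
# in this example consume the config; the yaml file name below is illustrative):
#     cfg = get_cfg_defaults()
#     cfg.merge_from_file("my_overrides.yaml")  # optional yaml overrides
#     cfg.freeze()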
|
@ -0,0 +1,96 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import multiprocessing as mp
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from parakeet.audio import AudioProcessor
|
||||
from parakeet.audio.spec_normalizer import NormalizerBase, LogMagnitude
|
||||
|
||||
import tqdm
|
||||
|
||||
from config import get_cfg_defaults
|
||||
|
||||
|
||||
def extract_mel(fname: Path,
|
||||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
p: AudioProcessor,
|
||||
n: NormalizerBase):
|
||||
relative_path = fname.relative_to(input_dir)
|
||||
out_path = (output_dir / relative_path).with_suffix(".npy")
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
wav = p.read_wav(fname)
|
||||
mel = p.mel_spectrogram(wav)
|
||||
mel = n.transform(mel)
|
||||
np.save(out_path, mel)
|
||||
|
||||
|
||||
def extract_mel_multispeaker(config, input_dir, output_dir, extension=".wav"):
|
||||
input_dir = Path(input_dir).expanduser()
|
||||
fnames = list(input_dir.rglob(f"*{extension}"))
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
p = AudioProcessor(config.sample_rate, config.n_fft, config.win_length,
|
||||
config.hop_length, config.d_mels, config.fmin,
|
||||
config.fmax)
|
||||
n = LogMagnitude(1e-5)
|
||||
|
||||
func = partial(
|
||||
extract_mel, input_dir=input_dir, output_dir=output_dir, p=p, n=n)
|
||||
|
||||
with mp.Pool(16) as pool:
|
||||
list(
|
||||
tqdm.tqdm(
|
||||
pool.imap(func, fnames), total=len(fnames), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Extract mel spectrogram from processed wav in AiShell3 training dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config",
|
||||
type=str,
|
||||
help="yaml config file to overwrite the default config")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the processed wav folder")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/mel",
|
||||
help="path of the folder to save mel spectrograms")
|
||||
parser.add_argument(
|
||||
"--opts",
|
||||
nargs=argparse.REMAINDER,
|
||||
help="options to overwrite --config file and the default config, passing in KEY VALUE pairs"
|
||||
)
|
||||
default_config = get_cfg_defaults()
|
||||
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
default_config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
default_config.merge_from_list(args.opts)
|
||||
default_config.freeze()
|
||||
audio_config = default_config.data
|
||||
|
||||
extract_mel_multispeaker(audio_config, args.input, args.output)
|
After Width: | Height: | Size: 221 KiB |
After Width: | Height: | Size: 550 KiB |
After Width: | Height: | Size: 514 KiB |
@ -0,0 +1,258 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pickle
|
||||
|
||||
import yaml
|
||||
import tqdm
|
||||
|
||||
zh_pattern = re.compile("[\u4e00-\u9fa5]")
|
||||
|
||||
_tones = {'<pad>', '<s>', '</s>', '0', '1', '2', '3', '4', '5'}
|
||||
|
||||
_pauses = {'%', '$'}
|
||||
|
||||
_initials = {
|
||||
'b',
|
||||
'p',
|
||||
'm',
|
||||
'f',
|
||||
'd',
|
||||
't',
|
||||
'n',
|
||||
'l',
|
||||
'g',
|
||||
'k',
|
||||
'h',
|
||||
'j',
|
||||
'q',
|
||||
'x',
|
||||
'zh',
|
||||
'ch',
|
||||
'sh',
|
||||
'r',
|
||||
'z',
|
||||
'c',
|
||||
's',
|
||||
}
|
||||
|
||||
_finals = {
|
||||
'ii',
|
||||
'iii',
|
||||
'a',
|
||||
'o',
|
||||
'e',
|
||||
'ea',
|
||||
'ai',
|
||||
'ei',
|
||||
'ao',
|
||||
'ou',
|
||||
'an',
|
||||
'en',
|
||||
'ang',
|
||||
'eng',
|
||||
'er',
|
||||
'i',
|
||||
'ia',
|
||||
'io',
|
||||
'ie',
|
||||
'iai',
|
||||
'iao',
|
||||
'iou',
|
||||
'ian',
|
||||
'ien',
|
||||
'iang',
|
||||
'ieng',
|
||||
'u',
|
||||
'ua',
|
||||
'uo',
|
||||
'uai',
|
||||
'uei',
|
||||
'uan',
|
||||
'uen',
|
||||
'uang',
|
||||
'ueng',
|
||||
'v',
|
||||
've',
|
||||
'van',
|
||||
'ven',
|
||||
'veng',
|
||||
}
|
||||
|
||||
_ernized_symbol = {'&r'}
|
||||
|
||||
_specials = {'<pad>', '<unk>', '<s>', '</s>'}
|
||||
|
||||
_phones = _initials | _finals | _ernized_symbol | _specials | _pauses
|
||||
|
||||
|
||||
def is_zh(word):
|
||||
global zh_pattern
|
||||
match = zh_pattern.search(word)
|
||||
return match is not None
|
||||
|
||||
|
||||
def ernized(syllable):
|
||||
return syllable[:2] != "er" and syllable[-2] == 'r'
|
||||
|
||||
|
||||
def convert(syllable):
|
||||
# expansion of o -> uo
|
||||
syllable = re.sub(r"([bpmf])o$", r"\1uo", syllable)
|
||||
# syllable = syllable.replace("bo", "buo").replace("po", "puo").replace("mo", "muo").replace("fo", "fuo")
|
||||
# expansion for iong, ong
|
||||
syllable = syllable.replace("iong", "veng").replace("ong", "ueng")
|
||||
|
||||
# expansion for ing, in
|
||||
syllable = syllable.replace("ing", "ieng").replace("in", "ien")
|
||||
|
||||
# expansion for un, ui, iu
|
||||
syllable = syllable.replace("un", "uen").replace("ui",
|
||||
"uei").replace("iu", "iou")
|
||||
|
||||
# rule for variants of i
|
||||
syllable = syllable.replace("zi", "zii").replace("ci", "cii").replace("si", "sii")\
|
||||
.replace("zhi", "zhiii").replace("chi", "chiii").replace("shi", "shiii")\
|
||||
.replace("ri", "riii")
|
||||
|
||||
# rule for y preceding i, u
|
||||
syllable = syllable.replace("yi", "i").replace("yu", "v").replace("y", "i")
|
||||
|
||||
# rule for w
|
||||
syllable = syllable.replace("wu", "u").replace("w", "u")
|
||||
|
||||
# rule for v following j, q, x
|
||||
syllable = syllable.replace("ju", "jv").replace("qu",
|
||||
"qv").replace("xu", "xv")
|
||||
|
||||
return syllable
|
||||
|
||||
|
||||
def split_syllable(syllable: str):
|
||||
"""Split a syllable in pinyin into a list of phones and a list of tones.
|
||||
Initials have no tone, represented by '0', while finals have tones from
|
||||
'1,2,3,4,5'.
|
||||
|
||||
e.g.
|
||||
|
||||
zhang1 -> ['zh', 'ang'], ['0', '1']
|
||||
"""
|
||||
if syllable in _pauses:
|
||||
# syllable, tone
|
||||
return [syllable], ['0']
|
||||
|
||||
tone = syllable[-1]
|
||||
syllable = convert(syllable[:-1])
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
|
||||
global _initials
|
||||
if syllable[:2] in _initials:
|
||||
phones.append(syllable[:2])
|
||||
tones.append('0')
|
||||
phones.append(syllable[2:])
|
||||
tones.append(tone)
|
||||
elif syllable[0] in _initials:
|
||||
phones.append(syllable[0])
|
||||
tones.append('0')
|
||||
phones.append(syllable[1:])
|
||||
tones.append(tone)
|
||||
else:
|
||||
phones.append(syllable)
|
||||
tones.append(tone)
|
||||
return phones, tones
|
||||
|
||||
|
||||
def load_aishell3_transcription(line: str):
|
||||
sentence_id, pinyin, text = line.strip().split("|")
|
||||
syllables = pinyin.strip().split()
|
||||
|
||||
results = []
|
||||
|
||||
for syllable in syllables:
|
||||
if syllable in _pauses:
|
||||
results.append(syllable)
|
||||
elif not ernized(syllable):
|
||||
results.append(syllable)
|
||||
else:
|
||||
results.append(syllable[:-2] + syllable[-1])
|
||||
results.append('&r5')
|
||||
|
||||
phones = []
|
||||
tones = []
|
||||
for syllable in results:
|
||||
p, t = split_syllable(syllable)
|
||||
phones.extend(p)
|
||||
tones.extend(t)
|
||||
for p in phones:
|
||||
assert p in _phones, p
|
||||
return {
|
||||
"sentence_id": sentence_id,
|
||||
"text": text,
|
||||
"syllables": results,
|
||||
"phones": phones,
|
||||
"tones": tones
|
||||
}
|
||||
|
||||
|
||||
def process_aishell3(dataset_root, output_dir):
|
||||
dataset_root = Path(dataset_root).expanduser()
|
||||
output_dir = Path(output_dir).expanduser()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
prosody_label_path = dataset_root / "label_train-set.txt"
|
||||
with open(prosody_label_path, 'rt') as f:
|
||||
lines = [line.strip() for line in f]
|
||||
|
||||
records = lines[5:]
|
||||
|
||||
processed_records = []
|
||||
for record in tqdm.tqdm(records):
|
||||
new_record = load_aishell3_transcription(record)
|
||||
processed_records.append(new_record)
|
||||
print(new_record)
|
||||
|
||||
with open(output_dir / "metadata.pickle", 'wb') as f:
|
||||
pickle.dump(processed_records, f)
|
||||
|
||||
with open(output_dir / "metadata.yaml", 'wt', encoding="utf-8") as f:
|
||||
yaml.safe_dump(
|
||||
processed_records, f, default_flow_style=None, allow_unicode=True)
|
||||
|
||||
print("metadata done!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Preprocess transcription of AiShell3 and save them in a compact file(yaml and pickle)."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train",
|
||||
help="path of the training dataset,(contains a label_train-set.txt).")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
help="the directory to save the processed transcription."
|
||||
"If not provided, it would be the same as the input.")
|
||||
args = parser.parse_args()
|
||||
if args.output is None:
|
||||
args.output = args.input
|
||||
|
||||
process_aishell3(args.input, args.output)
|
@ -0,0 +1,95 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from multiprocessing import Pool
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import librosa
|
||||
import soundfile as sf
|
||||
from tqdm import tqdm
|
||||
from praatio import tgio
|
||||
|
||||
|
||||
def get_valid_part(fpath):
|
||||
f = tgio.openTextgrid(fpath)
|
||||
|
||||
start = 0
|
||||
phone_entry_list = f.tierDict['phones'].entryList
|
||||
first_entry = phone_entry_list[0]
|
||||
if first_entry.label == "sil":
|
||||
start = first_entry.end
|
||||
|
||||
last_entry = phone_entry_list[-1]
|
||||
if last_entry.label == "sp":
|
||||
end = last_entry.start
|
||||
else:
|
||||
end = last_entry.end
|
||||
return start, end
|
||||
|
||||
|
||||
def process_utterance(fpath, source_dir, target_dir, alignment_dir):
|
||||
rel_path = fpath.relative_to(source_dir)
|
||||
opath = target_dir / rel_path
|
||||
apath = (alignment_dir / rel_path).with_suffix(".TextGrid")
|
||||
opath.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
start, end = get_valid_part(apath)
|
||||
wav, _ = librosa.load(fpath, sr=22050, offset=start, duration=end - start)
|
||||
normalized_wav = wav / np.max(wav) * 0.999
|
||||
sf.write(opath, normalized_wav, samplerate=22050, subtype='PCM_16')
|
||||
# print(f"{fpath} => {opath}")
|
||||
|
||||
|
||||
def preprocess_aishell3(source_dir, target_dir, alignment_dir):
|
||||
source_dir = Path(source_dir).expanduser()
|
||||
target_dir = Path(target_dir).expanduser()
|
||||
alignment_dir = Path(alignment_dir).expanduser()
|
||||
|
||||
wav_paths = list(source_dir.rglob("*.wav"))
|
||||
print(f"there are {len(wav_paths)} audio files in total")
|
||||
fx = partial(
|
||||
process_utterance,
|
||||
source_dir=source_dir,
|
||||
target_dir=target_dir,
|
||||
alignment_dir=alignment_dir)
|
||||
with Pool(16) as p:
|
||||
list(
|
||||
tqdm(p.imap(fx, wav_paths), total=len(wav_paths), unit="utterance"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Process audio in AiShell3, trim silence according to the alignment "
|
||||
"files generated by MFA, and normalize volume by peak.")
|
||||
parser.add_argument(
|
||||
"--input",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/wav",
|
||||
help="path of the original audio folder in aishell3.")
|
||||
parser.add_argument(
|
||||
"--output",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/normalized_wav",
|
||||
help="path of the folder to save the processed audio files.")
|
||||
parser.add_argument(
|
||||
"--alignment",
|
||||
type=str,
|
||||
default="~/datasets/aishell3/train/alignment",
|
||||
help="path of the alignment files.")
|
||||
args = parser.parse_args()
|
||||
|
||||
preprocess_aishell3(args.input, args.output, args.alignment)
|
@ -0,0 +1,262 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
import numpy as np
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
import paddle
|
||||
from paddle import distributed as dist
|
||||
from paddle.io import DataLoader, DistributedBatchSampler
|
||||
|
||||
from parakeet.data import dataset
|
||||
from parakeet.training.cli import default_argument_parser
|
||||
from parakeet.training.experiment import ExperimentBase
|
||||
from parakeet.utils import display, mp_tools
|
||||
from parakeet.models.tacotron2 import Tacotron2, Tacotron2Loss
|
||||
|
||||
from config import get_cfg_defaults
|
||||
from aishell3 import AiShell3, collate_aishell3_examples
|
||||
|
||||
|
||||
class Experiment(ExperimentBase):
|
||||
def compute_losses(self, inputs, outputs):
|
||||
texts, tones, mel_targets, utterance_embeds, text_lens, output_lens, stop_tokens = inputs
|
||||
|
||||
mel_outputs = outputs["mel_output"]
|
||||
mel_outputs_postnet = outputs["mel_outputs_postnet"]
|
||||
alignments = outputs["alignments"]
|
||||
|
||||
losses = self.criterion(mel_outputs, mel_outputs_postnet, mel_targets,
|
||||
alignments, output_lens, text_lens)
|
||||
return losses
|
||||
|
||||
def train_batch(self):
|
||||
start = time.time()
|
||||
batch = self.read_batch()
|
||||
data_loader_time = time.time() - start
|
||||
|
||||
self.optimizer.clear_grad()
|
||||
self.model.train()
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
loss = losses["loss"]
|
||||
loss.backward()
|
||||
self.optimizer.step()
|
||||
iteration_time = time.time() - start
|
||||
|
||||
losses_np = {k: float(v) for k, v in losses.items()}
|
||||
# logging
|
||||
msg = "Rank: {}, ".format(dist.get_rank())
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += "time: {:>.3f}s/{:>.3f}s, ".format(data_loader_time,
|
||||
iteration_time)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in losses_np.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
for key, value in losses_np.items():
|
||||
self.visualizer.add_scalar(f"train_loss/{key}", value,
|
||||
self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def valid(self):
|
||||
valid_losses = defaultdict(list)
|
||||
for i, batch in enumerate(self.valid_loader):
|
||||
texts, tones, mels, utterance_embeds, text_lens, output_lens, stop_tokens = batch
|
||||
outputs = self.model(
|
||||
texts,
|
||||
text_lens,
|
||||
mels,
|
||||
output_lens,
|
||||
tones=tones,
|
||||
global_condition=utterance_embeds)
|
||||
losses = self.compute_losses(batch, outputs)
|
||||
for key, value in losses.items():
|
||||
valid_losses[key].append(float(value))
|
||||
|
||||
attention_weights = outputs["alignments"]
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_alignments",
|
||||
display.plot_alignment(attention_weights[0].numpy().T),
|
||||
self.iteration)
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_target_spectrogram",
|
||||
display.plot_spectrogram(mels[0].numpy().T), self.iteration)
|
||||
mel_pred = outputs['mel_outputs_postnet']
|
||||
self.visualizer.add_figure(
|
||||
f"valid_sentence_{i}_predicted_spectrogram",
|
||||
display.plot_spectrogram(mel_pred[0].numpy().T), self.iteration)
|
||||
|
||||
# write visual log
|
||||
valid_losses = {k: np.mean(v) for k, v in valid_losses.items()}
|
||||
|
||||
# logging
|
||||
msg = "Valid: "
|
||||
msg += "step: {}, ".format(self.iteration)
|
||||
msg += ', '.join('{}: {:>.6f}'.format(k, v)
|
||||
for k, v in valid_losses.items())
|
||||
self.logger.info(msg)
|
||||
|
||||
for key, value in valid_losses.items():
|
||||
self.visualizer.add_scalar(f"valid/{key}", value, self.iteration)
|
||||
|
||||
@mp_tools.rank_zero_only
|
||||
@paddle.no_grad()
|
||||
def eval(self):
|
||||
"""Evaluation of Tacotron2 in autoregressive manner."""
|
||||
self.model.eval()
|
||||
mel_dir = Path(self.output_dir / ("eval_{}".format(self.iteration)))
|
||||
mel_dir.mkdir(parents=True, exist_ok=True)
|
||||
for i, batch in enumerate(self.test_loader):
|
||||
texts, tones, mels, utterance_embeds, *_ = batch
|
||||
outputs = self.model.infer(
|
||||
texts, tones=tones, global_condition=utterance_embeds)
|
||||
|
||||
display.plot_alignment(outputs["alignments"][0].numpy().T)
|
||||
plt.savefig(mel_dir / f"sentence_{i}.png")
|
||||
plt.close()
|
||||
np.save(mel_dir / f"sentence_{i}",
|
||||
outputs["mel_outputs_postnet"][0].numpy().T)
|
||||
print(f"sentence_{i}")
|
||||
|
||||
def setup_model(self):
|
||||
config = self.config
|
||||
model = Tacotron2(
|
||||
vocab_size=config.model.vocab_size,
|
||||
n_tones=config.model.n_tones,
|
||||
d_mels=config.data.d_mels,
|
||||
d_encoder=config.model.d_encoder,
|
||||
encoder_conv_layers=config.model.encoder_conv_layers,
|
||||
encoder_kernel_size=config.model.encoder_kernel_size,
|
||||
d_prenet=config.model.d_prenet,
|
||||
d_attention_rnn=config.model.d_attention_rnn,
|
||||
d_decoder_rnn=config.model.d_decoder_rnn,
|
||||
attention_filters=config.model.attention_filters,
|
||||
attention_kernel_size=config.model.attention_kernel_size,
|
||||
d_attention=config.model.d_attention,
|
||||
d_postnet=config.model.d_postnet,
|
||||
postnet_kernel_size=config.model.postnet_kernel_size,
|
||||
postnet_conv_layers=config.model.postnet_conv_layers,
|
||||
reduction_factor=config.model.reduction_factor,
|
||||
p_encoder_dropout=config.model.p_encoder_dropout,
|
||||
p_prenet_dropout=config.model.p_prenet_dropout,
|
||||
p_attention_dropout=config.model.p_attention_dropout,
|
||||
p_decoder_dropout=config.model.p_decoder_dropout,
|
||||
p_postnet_dropout=config.model.p_postnet_dropout,
|
||||
d_global_condition=config.model.d_global_condition,
|
||||
use_stop_token=config.model.use_stop_token, )
|
||||
|
||||
if self.parallel:
|
||||
model = paddle.DataParallel(model)
|
||||
|
||||
grad_clip = paddle.nn.ClipGradByGlobalNorm(
|
||||
config.training.grad_clip_thresh)
|
||||
optimizer = paddle.optimizer.Adam(
|
||||
learning_rate=config.training.lr,
|
||||
parameters=model.parameters(),
|
||||
weight_decay=paddle.regularizer.L2Decay(
|
||||
config.training.weight_decay),
|
||||
grad_clip=grad_clip)
|
||||
criterion = Tacotron2Loss(
|
||||
use_stop_token_loss=config.model.use_stop_token,
|
||||
use_guided_attention_loss=config.model.use_guided_attention_loss,
|
||||
sigma=config.model.guided_attention_loss_sigma)
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
self.criterion = criterion
|
||||
|
||||
def setup_dataloader(self):
|
||||
args = self.args
|
||||
config = self.config
|
||||
aishell3_dataset = AiShell3(args.data)
|
||||
|
||||
valid_set, train_set = dataset.split(aishell3_dataset,
|
||||
config.data.valid_size)
|
||||
batch_fn = collate_aishell3_examples
|
||||
|
||||
if not self.parallel:
|
||||
self.train_loader = DataLoader(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True,
|
||||
collate_fn=batch_fn)
|
||||
else:
|
||||
sampler = DistributedBatchSampler(
|
||||
train_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=True,
|
||||
drop_last=True)
|
||||
self.train_loader = DataLoader(
|
||||
train_set, batch_sampler=sampler, collate_fn=batch_fn)
|
||||
|
||||
self.valid_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=config.data.batch_size,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
self.test_loader = DataLoader(
|
||||
valid_set,
|
||||
batch_size=1,
|
||||
shuffle=False,
|
||||
drop_last=False,
|
||||
collate_fn=batch_fn)
|
||||
|
||||
|
||||
def main_sp(config, args):
|
||||
exp = Experiment(config, args)
|
||||
exp.setup()
|
||||
exp.resume_or_load()
|
||||
if not args.test:
|
||||
exp.run()
|
||||
else:
|
||||
exp.eval()
|
||||
|
||||
|
||||
def main(config, args):
|
||||
if args.nprocs > 1 and args.device == "gpu":
|
||||
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
|
||||
else:
|
||||
main_sp(config, args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
config = get_cfg_defaults()
|
||||
parser = default_argument_parser()
|
||||
parser.add_argument("--test", action="store_true")
|
||||
args = parser.parse_args()
|
||||
if args.config:
|
||||
config.merge_from_file(args.config)
|
||||
if args.opts:
|
||||
config.merge_from_list(args.opts)
|
||||
config.freeze()
|
||||
print(config)
|
||||
print(args)
|
||||
|
||||
main(config, args)
|
@ -0,0 +1,226 @@
|
||||
# Speedyspeech with CSMSC
|
||||
|
||||
This example contains code used to train a [Speedyspeech](http://arxiv.org/abs/2008.03802) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html). NOTE that we only implement the student part of the Speedyspeech model. The ground truth alignment used to train the model is extracted from the dataset using [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner).
|
||||
|
||||
## Dataset
|
||||
### Download and Extract the dataset
|
||||
Download CSMSC from its [official website](https://test.data-baker.com/data/index/source).
|
||||
|
||||
### Get MFA result of CSMSC and Extract it
|
||||
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for SpeedySpeech.
|
||||
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [use_mfa example](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/use_mfa) in our repo.
|
||||
|
||||
## Preprocess the dataset
|
||||
Assume the path to the dataset is `~/datasets/BZNSYP`.
|
||||
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
|
||||
Run the command below to preprocess the dataset.
|
||||
```bash
|
||||
./preprocess.sh
|
||||
```
|
||||
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
|
||||
|
||||
```text
|
||||
dump
|
||||
├── dev
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
├── test
|
||||
│ ├── norm
|
||||
│ └── raw
|
||||
└── train
|
||||
├── norm
|
||||
├── raw
|
||||
└── feats_stats.npy
|
||||
```
|
||||
|
||||
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains `norm` and `raw` subfolders. The `raw` folder contains the log magnitude mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrograms. The statistics used to normalize the spectrograms are computed from the training set and stored in `dump/train/feats_stats.npy`.
|
||||
|
||||
There is also a `metadata.jsonl` in each subfolder. It is a table-like file that contains the phones, tones, durations, spectrogram path, and id of each utterance.
|
||||
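For a quick look at its contents, you can pretty-print the first record (a hedged sketch; it assumes each line is a complete JSON record, as the `.jsonl` extension suggests, and the exact field names are defined by the preprocessing script):

```bash
head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
```
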
|
||||
## Train the model
|
||||
`./run.sh` calls `../train.py`.
|
||||
```bash
|
||||
./run.sh
|
||||
```
|
||||
Here's the complete help message.
|
||||
|
||||
```text
|
||||
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
|
||||
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--device DEVICE] [--nprocs NPROCS] [--verbose VERBOSE]
|
||||
[--use-relative-path USE_RELATIVE_PATH]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
|
||||
Train a Speedyspeech model with sigle speaker dataset.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--config CONFIG config file.
|
||||
--train-metadata TRAIN_METADATA
|
||||
training data.
|
||||
--dev-metadata DEV_METADATA
|
||||
dev data.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir.
|
||||
--device DEVICE device type to use.
|
||||
--nprocs NPROCS number of processes.
|
||||
--verbose VERBOSE verbose.
|
||||
--use-relative-path USE_RELATIVE_PATH
|
||||
whether use relative path in metadata
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
```
|
||||
|
||||
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
|
||||
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
|
||||
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
|
||||
4. `--device` is the type of the device to run the experiment, 'cpu' or 'gpu' are supported.
|
||||
5. `--nprocs` is the number of processes to run in parallel, note that nprocs > 1 is only supported when `--device` is 'gpu'.
|
||||
6. `--phones-dict` is the path of the phone vocabulary file.
|
||||
7. `--tones-dict` is the path of the tone vocabulary file. A minimal example invocation combining these options is sketched after this list.
|
||||
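For reference, a minimal `train.py` invocation along the lines of what `./run.sh` passes might look like the sketch below. The paths are illustrative and assume the `dump` layout described above; check `run.sh` for the exact arguments used in this example.

```bash
python3 ../train.py \
    --config=conf/default.yaml \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --output-dir=exp/default \
    --device=gpu \
    --nprocs=1 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
```
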
|
||||
## Pretrained Model
|
||||
Pretrained SpeedySpeech model with no silence in the edge of audios. [speedyspeech_nosil_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/speedyspeech_nosil_baker_ckpt_0.5.zip)
|
||||
|
||||
SpeedySpeech checkpoint contains files listed below.
|
||||
```text
|
||||
speedyspeech_nosil_baker_ckpt_0.5
|
||||
├── default.yaml # default config used to train speedyspeech
|
||||
├── feats_stats.npy # statistics used to normalize spectrogram when training speedyspeech
|
||||
├── phone_id_map.txt # phone vocabulary file when training speedyspeech
|
||||
├── snapshot_iter_11400.pdz # model parameters and optimizer states
|
||||
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
|
||||
```
|
||||
|
||||
## Synthesize
|
||||
We use [parallel wavegan](https://github.com/PaddlePaddle/Parakeet/tree/develop/examples/parallelwave_gan/baker) as the neural vocoder.
|
||||
Download pretrained parallel wavegan model from [pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/pwg_baker_ckpt_0.4.zip) and unzip it.
|
||||
```bash
|
||||
unzip pwg_baker_ckpt_0.4.zip
|
||||
```
|
||||
Parallel WaveGAN checkpoint contains files listed below.
|
||||
```text
|
||||
pwg_baker_ckpt_0.4
|
||||
├── pwg_default.yaml # default config used to train parallel wavegan
|
||||
├── pwg_snapshot_iter_400000.pdz # model parameters of parallel wavegan
|
||||
└── pwg_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
|
||||
```
|
||||
`synthesize.sh` calls `../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
|
||||
```bash
|
||||
./synthesize.sh
|
||||
```
|
||||
```text
|
||||
usage: synthesize.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
|
||||
[--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT]
|
||||
[--speedyspeech-stat SPEEDYSPEECH_STAT]
|
||||
[--pwg-config PWG_CONFIG]
|
||||
[--pwg-checkpoint PWG_CHECKPOINT] [--pwg-stat PWG_STAT]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
[--test-metadata TEST_METADATA] [--output-dir OUTPUT_DIR]
|
||||
[--inference-dir INFERENCE_DIR] [--device DEVICE]
|
||||
[--verbose VERBOSE]
|
||||
|
||||
Synthesize with speedyspeech & parallel wavegan.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--speedyspeech-config SPEEDYSPEECH_CONFIG
|
||||
config file for speedyspeech.
|
||||
--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT
|
||||
speedyspeech checkpoint to load.
|
||||
--speedyspeech-stat SPEEDYSPEECH_STAT
|
||||
mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--pwg-config PWG_CONFIG
|
||||
config file for parallelwavegan.
|
||||
--pwg-checkpoint PWG_CHECKPOINT
|
||||
parallel wavegan generator parameters to load.
|
||||
--pwg-stat PWG_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--test-metadata TEST_METADATA
|
||||
test metadata
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir
|
||||
--inference-dir INFERENCE_DIR
|
||||
dir to save inference models
|
||||
--device DEVICE device type to use
|
||||
--verbose VERBOSE verbose
|
||||
```
|
||||
`synthesize_e2e.sh` calls `synthesize_e2e.py`, which can synthesize waveform from text file.
|
||||
```bash
|
||||
./synthesize_e2e.sh
|
||||
```
|
||||
```text
|
||||
usage: synthesize_e2e.py [-h] [--speedyspeech-config SPEEDYSPEECH_CONFIG]
|
||||
[--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT]
|
||||
[--speedyspeech-stat SPEEDYSPEECH_STAT]
|
||||
[--pwg-config PWG_CONFIG]
|
||||
[--pwg-checkpoint PWG_CHECKPOINT]
|
||||
[--pwg-stat PWG_STAT] [--text TEXT]
|
||||
[--phones-dict PHONES_DICT] [--tones-dict TONES_DICT]
|
||||
[--output-dir OUTPUT_DIR]
|
||||
[--inference-dir INFERENCE_DIR] [--device DEVICE]
|
||||
[--verbose VERBOSE]
|
||||
|
||||
Synthesize with speedyspeech & parallel wavegan.
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
--speedyspeech-config SPEEDYSPEECH_CONFIG
|
||||
config file for speedyspeech.
|
||||
--speedyspeech-checkpoint SPEEDYSPEECH_CHECKPOINT
|
||||
speedyspeech checkpoint to load.
|
||||
--speedyspeech-stat SPEEDYSPEECH_STAT
|
||||
mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--pwg-config PWG_CONFIG
|
||||
config file for parallelwavegan.
|
||||
--pwg-checkpoint PWG_CHECKPOINT
|
||||
parallel wavegan checkpoint to load.
|
||||
--pwg-stat PWG_STAT mean and standard deviation used to normalize
|
||||
spectrogram when training speedyspeech.
|
||||
--text TEXT text to synthesize, a 'utt_id sentence' pair per line
|
||||
--phones-dict PHONES_DICT
|
||||
phone vocabulary file.
|
||||
--tones-dict TONES_DICT
|
||||
tone vocabulary file.
|
||||
--output-dir OUTPUT_DIR
|
||||
output dir
|
||||
--inference-dir INFERENCE_DIR
|
||||
dir to save inference models
|
||||
--device DEVICE device type to use
|
||||
--verbose VERBOSE verbose
|
||||
```
|
||||
1. `--speedyspeech-config`, `--speedyspeech-checkpoint`, `--speedyspeech-stat` are arguments for speedyspeech, which correspond to the 3 files in the speedyspeech pretrained model.
|
||||
2. `--pwg-config`, `--pwg-checkpoint`, `--pwg-stat` are arguments for parallel wavegan, which correspond to the 3 files in the parallel wavegan pretrained model.
|
||||
3. `--text` is the text file, which contains sentences to synthesize.
|
||||
4. `--output-dir` is the directory to save synthesized audio files.
|
||||
5. `--inference-dir` is the directory to save the exported models, which can be used with Paddle Inference.
|
||||
6. `--device` is the type of device to run synthesis, 'cpu' and 'gpu' are supported. 'gpu' is recommended for faster synthesis.
|
||||
7. `--phones-dict` is the path of the phone vocabulary file.
|
||||
8. `--tones-dict` is the path of the tone vocabulary file.
|
||||
|
||||
You can use the following scripts to synthesize for `../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
|
||||
```bash
|
||||
FLAGS_allocator_strategy=naive_best_fit \
|
||||
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
|
||||
python3 synthesize_e2e.py \
|
||||
--speedyspeech-config=speedyspeech_nosil_baker_ckpt_0.5/default.yaml \
|
||||
--speedyspeech-checkpoint=speedyspeech_nosil_baker_ckpt_0.5/snapshot_iter_11400.pdz \
|
||||
--speedyspeech-stat=speedyspeech_nosil_baker_ckpt_0.5/feats_stats.npy \
|
||||
--pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
|
||||
--pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
|
||||
--pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
|
||||
--text=../sentences.txt \
|
||||
--output-dir=exp/default/test_e2e \
|
||||
--inference-dir=exp/default/inference \
|
||||
--device="gpu" \
|
||||
--phones-dict=speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt \
|
||||
--tones-dict=speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
|
||||
```
|
@ -0,0 +1,50 @@
|
||||
###########################################################
|
||||
# FEATURE EXTRACTION SETTING #
|
||||
###########################################################
|
||||
fs: 24000 # Sampling rate.
|
||||
n_fft: 2048 # FFT size.
|
||||
n_shift: 300 # Hop size.
|
||||
win_length: 1200 # Window length.
|
||||
# If set to null, it will be the same as fft_size.
|
||||
window: "hann" # Window function.
|
||||
n_mels: 80 # Number of mel basis.
|
||||
fmin: 80 # Minimum freq in mel basis calculation.
|
||||
fmax: 7600 # Maximum frequency in mel basis calculation.
|
||||
|
||||
###########################################################
|
||||
# DATA SETTING #
|
||||
###########################################################
|
||||
batch_size: 64
|
||||
num_workers: 4
|
||||
|
||||
###########################################################
|
||||
# MODEL SETTING #
|
||||
###########################################################
|
||||
model:
|
||||
encoder_hidden_size: 128
|
||||
encoder_kernel_size: 3
|
||||
encoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
|
||||
duration_predictor_hidden_size: 128
|
||||
decoder_hidden_size: 128
|
||||
decoder_output_size: 80
|
||||
decoder_kernel_size: 3
|
||||
decoder_dilations: [1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 3, 9, 27, 1, 1]
|
||||
|
||||
###########################################################
|
||||
# OPTIMIZER SETTING #
|
||||
###########################################################
|
||||
optimizer:
|
||||
optim: adam # optimizer type
|
||||
learning_rate: 0.002 # learning rate
|
||||
max_grad_norm: 1
|
||||
|
||||
###########################################################
|
||||
# TRAINING SETTING #
|
||||
###########################################################
|
||||
max_epoch: 200
|
||||
num_snapshots: 5
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
seed: 10086
|
@ -0,0 +1,146 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
from pathlib import Path

import soundfile as sf
from paddle import inference
from parakeet.frontend.zh_frontend import Frontend


def main():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--enable-auto-log", action="store_true", help="use auto log")
    parser.add_argument(
        "--phones-dict",
        type=str,
        default="phones.txt",
        help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict",
        type=str,
        default="tones.txt",
        help="tone vocabulary file.")

    args, _ = parser.parse_known_args()

    frontend = Frontend(
        phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    print("frontend done!")

    speedyspeech_config = inference.Config(
        str(Path(args.inference_dir) / "speedyspeech.pdmodel"),
        str(Path(args.inference_dir) / "speedyspeech.pdiparams"))
    speedyspeech_config.enable_use_gpu(100, 0)
    speedyspeech_config.enable_memory_optim()
    speedyspeech_predictor = inference.create_predictor(speedyspeech_config)

    pwg_config = inference.Config(
        str(Path(args.inference_dir) / "pwg.pdmodel"),
        str(Path(args.inference_dir) / "pwg.pdiparams"))
    pwg_config.enable_use_gpu(100, 0)
    pwg_config.enable_memory_optim()
    pwg_predictor = inference.create_predictor(pwg_config)

    if args.enable_auto_log:
        import auto_log
        os.makedirs("output", exist_ok=True)
        pid = os.getpid()
        logger = auto_log.AutoLogger(
            model_name="speedyspeech",
            model_precision='float32',
            batch_size=1,
            data_shape="dynamic",
            save_path="./output/auto_log.log",
            inference_config=speedyspeech_config,
            pids=pid,
            process_name=None,
            gpu_ids=0,
            time_keys=['preprocess_time', 'inference_time', 'postprocess_time'],
            warmup=0)

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    sentences = []

    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    for utt_id, sentence in sentences:
        if args.enable_auto_log:
            logger.times.start()

        input_ids = frontend.get_input_ids(
            sentence, merge_sentences=True, get_tone_ids=True)
        phone_ids = input_ids["phone_ids"]
        tone_ids = input_ids["tone_ids"]
        phones = phone_ids[0]
        tones = tone_ids[0]

        if args.enable_auto_log:
            logger.times.stamp()

        input_names = speedyspeech_predictor.get_input_names()
        phones_handle = speedyspeech_predictor.get_input_handle(input_names[0])
        tones_handle = speedyspeech_predictor.get_input_handle(input_names[1])

        phones_handle.reshape(phones.shape)
        phones_handle.copy_from_cpu(phones)
        tones_handle.reshape(tones.shape)
        tones_handle.copy_from_cpu(tones)

        speedyspeech_predictor.run()
        output_names = speedyspeech_predictor.get_output_names()
        output_handle = speedyspeech_predictor.get_output_handle(
            output_names[0])
        output_data = output_handle.copy_to_cpu()

        input_names = pwg_predictor.get_input_names()
        mel_handle = pwg_predictor.get_input_handle(input_names[0])
        mel_handle.reshape(output_data.shape)
        mel_handle.copy_from_cpu(output_data)

        pwg_predictor.run()
        output_names = pwg_predictor.get_output_names()
        output_handle = pwg_predictor.get_output_handle(output_names[0])
        wav = output_handle.copy_to_cpu()

        if args.enable_auto_log:
            logger.times.stamp()

        sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)

        if args.enable_auto_log:
            logger.times.end(stamp=True)
        print(f"{utt_id} done!")

    if args.enable_auto_log:
        logger.report()


if __name__ == "__main__":
    main()
@ -0,0 +1,8 @@
#!/bin/bash

python3 inference.py \
    --inference-dir=exp/default/inference \
    --text=../sentences.txt \
    --output-dir=exp/default/pd_infer_out \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
@ -0,0 +1,65 @@
#!/bin/bash

stage=0
stop_stage=100

export MAIN_ROOT=`realpath ${PWD}/../../../`

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # get durations from MFA's result
    echo "Generate durations.txt from MFA results ..."
    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
        --inputdir=./baker_alignment_tone \
        --output=durations.txt \
        --config=conf/default.yaml
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "Extract features ..."
    python3 ../preprocess.py \
        --dataset=baker \
        --rootdir=~/datasets/BZNSYP/ \
        --dumpdir=dump \
        --dur-file=durations.txt \
        --config=conf/default.yaml \
        --num-cpu=20 \
        --cut-sil=True \
        --use-relative-path=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "Get features' stats ..."
    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --field-name="feats" \
        --use-relative-path=True
fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # normalize and convert phone/tone to id, dev and test should use train's stats
    echo "Normalize ..."
    python3 ../normalize.py \
        --metadata=dump/train/raw/metadata.jsonl \
        --dumpdir=dump/train/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

    python3 ../normalize.py \
        --metadata=dump/dev/raw/metadata.jsonl \
        --dumpdir=dump/dev/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

    python3 ../normalize.py \
        --metadata=dump/test/raw/metadata.jsonl \
        --dumpdir=dump/test/norm \
        --stats=dump/train/feats_stats.npy \
        --phones-dict=dump/phone_id_map.txt \
        --tones-dict=dump/tone_id_map.txt \
        --use-relative-path=True

fi
@ -0,0 +1,12 @@
#!/bin/bash

python ../train.py \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --config=conf/default.yaml \
    --output-dir=exp/default \
    --nprocs=2 \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --use-relative-path=True
@ -0,0 +1,16 @@
#!/bin/bash

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ../synthesize.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
    --speedyspeech-stat=dump/train/feats_stats.npy \
    --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --test-metadata=dump/test/norm/metadata.jsonl \
    --output-dir=exp/default/test \
    --inference-dir=exp/default/inference \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt \
    --device="gpu"
@ -0,0 +1,196 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os
from pathlib import Path

import numpy as np
import soundfile as sf
import paddle
import yaml
from paddle import jit
from paddle.static import InputSpec
from parakeet.frontend.zh_frontend import Frontend
from parakeet.models.speedyspeech import SpeedySpeech
from parakeet.models.speedyspeech import SpeedySpeechInference
from parakeet.models.parallel_wavegan import PWGGenerator
from parakeet.models.parallel_wavegan import PWGInference
from parakeet.modules.normalizer import ZScore
from yacs.config import CfgNode


def evaluate(args, speedyspeech_config, pwg_config):
    # the dataloader logger is too verbose
    logging.getLogger("DataLoader").disabled = True

    # construct dataset for evaluation
    sentences = []
    with open(args.text, 'rt') as f:
        for line in f:
            utt_id, sentence = line.strip().split()
            sentences.append((utt_id, sentence))

    with open(args.phones_dict, "r") as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    vocab_size = len(phn_id)
    print("vocab_size:", vocab_size)
    with open(args.tones_dict, "r") as f:
        tone_id = [line.strip().split() for line in f.readlines()]
    tone_size = len(tone_id)
    print("tone_size:", tone_size)

    model = SpeedySpeech(
        vocab_size=vocab_size,
        tone_size=tone_size,
        **speedyspeech_config["model"])
    model.set_state_dict(
        paddle.load(args.speedyspeech_checkpoint)["main_params"])
    model.eval()

    vocoder = PWGGenerator(**pwg_config["generator_params"])
    vocoder.set_state_dict(paddle.load(args.pwg_checkpoint)["generator_params"])
    vocoder.remove_weight_norm()
    vocoder.eval()
    print("model done!")

    stat = np.load(args.speedyspeech_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    speedyspeech_normalizer = ZScore(mu, std)

    stat = np.load(args.pwg_stat)
    mu, std = stat
    mu = paddle.to_tensor(mu)
    std = paddle.to_tensor(std)
    pwg_normalizer = ZScore(mu, std)

    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
                                                   model)
    speedyspeech_inference.eval()
    speedyspeech_inference = jit.to_static(
        speedyspeech_inference,
        input_spec=[
            InputSpec([-1], dtype=paddle.int64),
            InputSpec([-1], dtype=paddle.int64),
        ])
    paddle.jit.save(speedyspeech_inference,
                    os.path.join(args.inference_dir, "speedyspeech"))
    speedyspeech_inference = paddle.jit.load(
        os.path.join(args.inference_dir, "speedyspeech"))

    pwg_inference = PWGInference(pwg_normalizer, vocoder)
    pwg_inference.eval()
    pwg_inference = jit.to_static(
        pwg_inference,
        input_spec=[
            InputSpec([-1, 80], dtype=paddle.float32),
        ])
    paddle.jit.save(pwg_inference, os.path.join(args.inference_dir, "pwg"))
    pwg_inference = paddle.jit.load(os.path.join(args.inference_dir, "pwg"))

    frontend = Frontend(
        phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
    print("frontend done!")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    for utt_id, sentence in sentences:
        input_ids = frontend.get_input_ids(
            sentence, merge_sentences=True, get_tone_ids=True)
        phone_ids = input_ids["phone_ids"]
        tone_ids = input_ids["tone_ids"]

        flags = 0
        for i in range(len(phone_ids)):
            part_phone_ids = phone_ids[i]
            part_tone_ids = tone_ids[i]
            with paddle.no_grad():
                mel = speedyspeech_inference(part_phone_ids, part_tone_ids)
                temp_wav = pwg_inference(mel)
            if flags == 0:
                wav = temp_wav
                flags = 1
            else:
                wav = paddle.concat([wav, temp_wav])
        sf.write(
            output_dir / (utt_id + ".wav"),
            wav.numpy(),
            samplerate=speedyspeech_config.fs)
        print(f"{utt_id} done!")


def main():
    # parse args and config
    parser = argparse.ArgumentParser(
        description="Synthesize with speedyspeech & parallel wavegan.")
    parser.add_argument(
        "--speedyspeech-config", type=str, help="config file for speedyspeech.")
    parser.add_argument(
        "--speedyspeech-checkpoint",
        type=str,
        help="speedyspeech checkpoint to load.")
    parser.add_argument(
        "--speedyspeech-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
    )
    parser.add_argument(
        "--pwg-config", type=str, help="config file for parallel wavegan.")
    parser.add_argument(
        "--pwg-checkpoint",
        type=str,
        help="parallel wavegan checkpoint to load.")
    parser.add_argument(
        "--pwg-stat",
        type=str,
        help="mean and standard deviation used to normalize spectrogram when training parallel wavegan."
    )
    parser.add_argument(
        "--text",
        type=str,
        help="text to synthesize, a 'utt_id sentence' pair per line")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict", type=str, default=None, help="tone vocabulary file.")
    parser.add_argument("--output-dir", type=str, help="output dir")
    parser.add_argument(
        "--inference-dir", type=str, help="dir to save inference models")
    parser.add_argument(
        "--device", type=str, default="gpu", help="device type to use")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")

    args, _ = parser.parse_known_args()

    paddle.set_device(args.device)

    with open(args.speedyspeech_config) as f:
        speedyspeech_config = CfgNode(yaml.safe_load(f))
    with open(args.pwg_config) as f:
        pwg_config = CfgNode(yaml.safe_load(f))

    print("========Args========")
    print(yaml.safe_dump(vars(args)))
    print("========Config========")
    print(speedyspeech_config)
    print(pwg_config)

    evaluate(args, speedyspeech_config, pwg_config)


if __name__ == "__main__":
    main()
@ -0,0 +1,16 @@
#!/bin/bash

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python synthesize_e2e.py \
    --speedyspeech-config=conf/default.yaml \
    --speedyspeech-checkpoint=exp/default/checkpoints/snapshot_iter_11400.pdz \
    --speedyspeech-stat=dump/train/feats_stats.npy \
    --pwg-config=pwg_baker_ckpt_0.4/pwg_default.yaml \
    --pwg-checkpoint=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    --pwg-stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
    --text=../sentences.txt \
    --output-dir=exp/default/test_e2e \
    --inference-dir=exp/default/inference \
    --device="gpu" \
    --phones-dict=dump/phone_id_map.txt \
    --tones-dict=dump/tone_id_map.txt
@ -0,0 +1,159 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Normalize feature files and dump them."""

import argparse
import logging
from operator import itemgetter
from pathlib import Path

import jsonlines
import numpy as np
from parakeet.datasets.data_table import DataTable
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


def main():
    """Run preprocessing process."""
    parser = argparse.ArgumentParser(
        description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
    )
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="metadata file of the dumped raw features to be normalized.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump normalized feature files.")
    parser.add_argument(
        "--stats", type=str, required=True, help="statistics file.")
    parser.add_argument(
        "--phones-dict", type=str, default=None, help="phone vocabulary file.")
    parser.add_argument(
        "--tones-dict", type=str, default=None, help="tone vocabulary file.")

    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--use-relative-path",
        type=str2bool,
        default=False,
        help="whether to use relative paths in metadata")
    args = parser.parse_args()

    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
        )
        logging.warning('Skip DEBUG/INFO messages')

    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    if args.use_relative_path:
        # if use_relative_path in preprocess, convert it to absolute path here
        metadata_dir = Path(args.metadata).parent
        for item in metadata:
            item["feats"] = str(metadata_dir / item["feats"])

    dataset = DataTable(
        metadata, converters={
            'feats': np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # restore scaler
    scaler = StandardScaler()
    scaler.mean_ = np.load(args.stats)[0]
    scaler.scale_ = np.load(args.stats)[1]
    # from version 0.23.0, this information is needed
    scaler.n_features_in_ = scaler.mean_.shape[0]

    vocab_phones = {}
    with open(args.phones_dict, 'rt') as f:
        phn_id = [line.strip().split() for line in f.readlines()]
    for phn, id in phn_id:
        vocab_phones[phn] = int(id)

    vocab_tones = {}
    with open(args.tones_dict, 'rt') as f:
        tone_id = [line.strip().split() for line in f.readlines()]
    for tone, id in tone_id:
        vocab_tones[tone] = int(id)

    # process each file
    output_metadata = []

    for item in tqdm(dataset):
        utt_id = item['utt_id']
        mel = item['feats']
        # normalize
        mel = scaler.transform(mel)

        # save
        mel_path = dumpdir / f"{utt_id}_feats.npy"
        np.save(mel_path, mel.astype(np.float32), allow_pickle=False)
        phone_ids = [vocab_phones[p] for p in item['phones']]
        tone_ids = [vocab_tones[p] for p in item['tones']]
        if args.use_relative_path:
            # convert absolute path to relative path
            mel_path = mel_path.relative_to(dumpdir)
        output_metadata.append({
            'utt_id': utt_id,
            'phones': phone_ids,
            'tones': tone_ids,
            'num_phones': item['num_phones'],
            'num_frames': item['num_frames'],
            'durations': item['durations'],
            'feats': str(mel_path),
        })
    output_metadata.sort(key=itemgetter('utt_id'))
    output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
    with jsonlines.open(output_metadata_path, 'w') as writer:
        for item in output_metadata:
            writer.write(item)
    logging.info(f"metadata dumped into {output_metadata_path}")


if __name__ == "__main__":
    main()
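A note on the --stats file read above: normalize.py restores the StandardScaler from rows 0 and 1 of a single .npy, so the statistics produced by compute_statistics.py in the run script are assumed to stack the per-dimension mean and scale along the first axis. A minimal inspection sketch under that assumption:

# Sketch: inspect the stats file the way normalize.py consumes it
# (row 0 = per-dimension mean, row 1 = per-dimension scale).
# The path matches the run scripts above; the layout is an assumption
# inferred from normalize.py, not verified against compute_statistics.py.
import numpy as np

stats = np.load("dump/train/feats_stats.npy")
mean, scale = stats[0], stats[1]
assert mean.shape == scale.shape  # one value per mel bin
print("n_mels:", mean.shape[0])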
@ -0,0 +1,293 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from operator import itemgetter
from typing import Any
from typing import Dict
from typing import List

import argparse
import jsonlines
import librosa
import numpy as np
import re
import tqdm
import yaml
from concurrent.futures import ThreadPoolExecutor
from parakeet.data.get_feats import LogMelFBank
from parakeet.datasets.preprocess_utils import compare_duration_and_mel_length
from parakeet.datasets.preprocess_utils import get_phones_tones
from parakeet.datasets.preprocess_utils import get_phn_dur
from parakeet.datasets.preprocess_utils import merge_silence
from pathlib import Path
from yacs.config import CfgNode


def process_sentence(config: Dict[str, Any],
                     fp: Path,
                     sentences: Dict,
                     output_dir: Path,
                     mel_extractor=None,
                     cut_sil: bool=True):
    utt_id = fp.stem
    record = None
    if utt_id in sentences:
        # reading, resampling may occur
        wav, _ = librosa.load(str(fp), sr=config.fs)
        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
            return record
        assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
        assert np.abs(wav).max() <= 1.0, \
            f"{utt_id} seems to be different from 16 bit PCM."
        phones = sentences[utt_id][0]
        durations = sentences[utt_id][1]
        speaker = sentences[utt_id][2]
        d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
        # slightly less precise than using *.TextGrid directly
        times = librosa.frames_to_time(
            d_cumsum, sr=config.fs, hop_length=config.n_shift)
        if cut_sil:
            start = 0
            end = d_cumsum[-1]
            if phones[0] == "sil" and len(durations) > 1:
                start = times[1]
                durations = durations[1:]
                phones = phones[1:]
            if phones[-1] == 'sil' and len(durations) > 1:
                end = times[-2]
                durations = durations[:-1]
                phones = phones[:-1]
            sentences[utt_id][0] = phones
            sentences[utt_id][1] = durations
            start, end = librosa.time_to_samples([start, end], sr=config.fs)
            wav = wav[start:end]

        # extract mel feats
        logmel = mel_extractor.get_log_mel_fbank(wav)
        # change duration according to mel_length
        compare_duration_and_mel_length(sentences, utt_id, logmel)
        labels = sentences[utt_id][0]
        # extract phone and duration
        phones = []
        tones = []
        for label in labels:
            # split tone from finals
            match = re.match(r'^(\w+)([012345])$', label)
            if match:
                phones.append(match.group(1))
                tones.append(match.group(2))
            else:
                phones.append(label)
                tones.append('0')
        durations = sentences[utt_id][1]
        num_frames = logmel.shape[0]
        assert sum(durations) == num_frames
        assert len(phones) == len(tones) == len(durations)

        mel_path = output_dir / (utt_id + "_feats.npy")
        np.save(mel_path, logmel)  # (num_frames, n_mels)
        record = {
            "utt_id": utt_id,
            "phones": phones,
            "tones": tones,
            "num_phones": len(phones),
            "num_frames": num_frames,
            "durations": durations,
            "feats": str(mel_path),  # Path object
        }
    return record


def process_sentences(config,
                      fps: List[Path],
                      sentences: Dict,
                      output_dir: Path,
                      mel_extractor=None,
                      nprocs: int=1,
                      cut_sil: bool=True,
                      use_relative_path: bool=False):
    if nprocs == 1:
        results = []
        for fp in tqdm.tqdm(fps, total=len(fps)):
            record = process_sentence(config, fp, sentences, output_dir,
                                      mel_extractor, cut_sil)
            if record:
                results.append(record)
    else:
        with ThreadPoolExecutor(nprocs) as pool:
            futures = []
            with tqdm.tqdm(total=len(fps)) as progress:
                for fp in fps:
                    future = pool.submit(process_sentence, config, fp,
                                         sentences, output_dir, mel_extractor,
                                         cut_sil)
                    future.add_done_callback(lambda p: progress.update())
                    futures.append(future)

                results = []
                for ft in futures:
                    record = ft.result()
                    if record:
                        results.append(record)

    results.sort(key=itemgetter("utt_id"))
    output_dir = Path(output_dir)
    metadata_path = output_dir / "metadata.jsonl"
    # NOTE: use relative path to the meta jsonlines file for Full Chain Project
    with jsonlines.open(metadata_path, 'w') as writer:
        for item in results:
            if use_relative_path:
                item["feats"] = str(Path(item["feats"]).relative_to(output_dir))
            writer.write(item)
    print("Done")


def main():
    # parse config and args
    parser = argparse.ArgumentParser(
        description="Preprocess audio and then extract features.")

    parser.add_argument(
        "--dataset",
        default="baker",
        type=str,
        help="name of dataset, should be in {baker} now")

    parser.add_argument(
        "--rootdir", default=None, type=str, help="directory of the dataset.")
    parser.add_argument(
        "--dumpdir",
        type=str,
        required=True,
        help="directory to dump feature files.")

    parser.add_argument(
        "--dur-file",
        default=None,
        type=str,
        help="path to baker durations.txt.")

    parser.add_argument("--config", type=str, help="speedyspeech config file.")

    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="logging level. higher is more logging. (default=1)")
    parser.add_argument(
        "--num-cpu", type=int, default=1, help="number of processes.")

    def str2bool(s):
        return s.lower() == 'true'

    parser.add_argument(
        "--cut-sil",
        type=str2bool,
        default=True,
        help="whether to cut silence at the edges of the audio")

    parser.add_argument(
        "--use-relative-path",
        type=str2bool,
        default=False,
        help="whether to use relative paths in metadata")

    args = parser.parse_args()

    rootdir = Path(args.rootdir).expanduser()
    dumpdir = Path(args.dumpdir).expanduser()
    # use absolute path
    dumpdir = dumpdir.resolve()
    dumpdir.mkdir(parents=True, exist_ok=True)
    dur_file = Path(args.dur_file).expanduser()

    assert rootdir.is_dir()
    assert dur_file.is_file()

    with open(args.config, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))

    if args.verbose > 1:
        print(vars(args))
        print(config)

    sentences, speaker_set = get_phn_dur(dur_file)

    merge_silence(sentences)
    phone_id_map_path = dumpdir / "phone_id_map.txt"
    tone_id_map_path = dumpdir / "tone_id_map.txt"
    get_phones_tones(sentences, phone_id_map_path, tone_id_map_path,
                     args.dataset)

    if args.dataset == "baker":
        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
        # split data into 3 sections
        num_train = 9800
        num_dev = 100
        train_wav_files = wav_files[:num_train]
        dev_wav_files = wav_files[num_train:num_train + num_dev]
        test_wav_files = wav_files[num_train + num_dev:]

    train_dump_dir = dumpdir / "train" / "raw"
    train_dump_dir.mkdir(parents=True, exist_ok=True)
    dev_dump_dir = dumpdir / "dev" / "raw"
    dev_dump_dir.mkdir(parents=True, exist_ok=True)
    test_dump_dir = dumpdir / "test" / "raw"
    test_dump_dir.mkdir(parents=True, exist_ok=True)

    # Extractor
    mel_extractor = LogMelFBank(
        sr=config.fs,
        n_fft=config.n_fft,
        hop_length=config.n_shift,
        win_length=config.win_length,
        window=config.window,
        n_mels=config.n_mels,
        fmin=config.fmin,
        fmax=config.fmax)

    # process for the 3 sections
    if train_wav_files:
        process_sentences(
            config,
            train_wav_files,
            sentences,
            train_dump_dir,
            mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)
    if dev_wav_files:
        process_sentences(
            config,
            dev_wav_files,
            sentences,
            dev_dump_dir,
            mel_extractor,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)
    if test_wav_files:
        process_sentences(
            config,
            test_wav_files,
            sentences,
            test_dump_dir,
            mel_extractor,
            nprocs=args.num_cpu,
            cut_sil=args.cut_sil,
            use_relative_path=args.use_relative_path)


if __name__ == "__main__":
    main()
@ -0,0 +1,16 @@
001 凯莫瑞安联合体的经济崩溃,迫在眉睫。
002 对于所有想要离开那片废土,去寻找更美好生活的人来说。
003 克哈,是你们所有人安全的港湾。
004 为了保护尤摩扬人民不受异虫的残害,我所做的,比他们自己的领导委员会都多。
005 无论他们如何诽谤我,我将继续为所有泰伦人的最大利益,而努力奋斗。
006 身为你们的元首,我带领泰伦人实现了人类统治领地和经济的扩张。
007 我们将继续成长,用行动回击那些只会说风凉话,不愿意和我们相向而行的害群之马。
008 帝国武装力量,无数的优秀儿女,正时刻守卫着我们的家园大门,但是他们孤木难支。
009 凡是今天应征入伍者,所获的所有刑罚罪责,减半。
010 激进分子和异见者希望你们一听见枪声,就背弃多年的和平与繁荣。
011 他们没有勇气和能力,带领人类穿越一个充满危险的星系。
012 法治是我们的命脉,然而它却受到前所未有的挑战。
013 我将恢复我们帝国的荣光,绝不会向任何外星势力低头。
014 我已经驯服了异虫,荡平了星灵。如今它们的创造者,想要夺走我们拥有的一切。
015 永远记住,谁才是最能保护你们的人。
016 不要听信别人的谗言,我不是什么克隆人。
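The lines above are the 'utt_id sentence' pairs that inference.py and synthesize_e2e.py read via line.strip().split(). A small reading sketch follows; the maxsplit=1 is a suggested robustness tweak (it keeps a sentence intact if it ever contains spaces) and is not part of this change.

# Sketch of how sentences.txt is consumed by the synthesis scripts above.
# maxsplit=1 is an assumption/tweak, not taken from this change.
sentences = []
with open("sentences.txt", "rt", encoding="utf-8") as f:
    for line in f:
        utt_id, sentence = line.strip().split(maxsplit=1)
        sentences.append((utt_id, sentence))
print(len(sentences), "sentences loaded")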