From 84576d695601f236d20bdac26ceab67b4f40755f Mon Sep 17 00:00:00 2001 From: qingen Date: Fri, 8 Apr 2022 18:06:55 +0800 Subject: [PATCH 01/18] [vec][score] add plda model, test=doc fix #1667 --- paddlespeech/vector/cluster/diarization.py | 207 +++++++- paddlespeech/vector/cluster/plda.py | 585 +++++++++++++++++++++ 2 files changed, 783 insertions(+), 9 deletions(-) create mode 100644 paddlespeech/vector/cluster/plda.py diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py index 6432acb8..c3dbb120 100644 --- a/paddlespeech/vector/cluster/diarization.py +++ b/paddlespeech/vector/cluster/diarization.py @@ -16,22 +16,22 @@ This script contains basic functions used for speaker diarization. This script has an optional dependency on open source sklearn library. A few sklearn functions are modified in this script as per requirement. """ - import argparse +import copy import warnings -import scipy + import numpy as np +import scipy +import sklearn from distutils.util import strtobool - +from scipy import linalg from scipy import sparse -from scipy.sparse.linalg import eigsh from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import laplacian as csgraph_laplacian - -import sklearn -from sklearn.neighbors import kneighbors_graph +from scipy.sparse.linalg import eigsh from sklearn.cluster import SpectralClustering from sklearn.cluster._kmeans import k_means +from sklearn.neighbors import kneighbors_graph def _graph_connected_component(graph, node_id): @@ -347,6 +347,8 @@ class EmbeddingMeta: --------- segset : list List of session IDs as an array of strings. + modelset : list + List of model IDs as an array of strings. stats : tensor An ndarray of float64. Each line contains embedding from the corresponding session. @@ -355,15 +357,20 @@ class EmbeddingMeta: def __init__( self, segset=None, + modelset=None, stats=None, ): if segset is None: - self.segset = numpy.empty(0, dtype="|O") - self.stats = numpy.array([], dtype=np.float64) + self.segset = np.empty(0, dtype="|O") + self.modelset = np.empty(0, dtype="|O") + self.stats = np.array([], dtype=np.float64) else: self.segset = segset + self.modelset = modelset self.stats = stats + self.stat0 = np.array([[1.0]] * self.stats.shape[0]) + def norm_stats(self): """ Divide all first-order statistics by their Euclidean norm. @@ -372,6 +379,188 @@ class EmbeddingMeta: vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf) self.stats = (self.stats.transpose() / vect_norm).transpose() + def get_mean_stats(self): + """ + Return the mean of first order statistics. + """ + mu = np.mean(self.stats, axis=0) + return mu + + def get_total_covariance_stats(self): + """ + Compute and return the total covariance matrix of the first-order statistics. + """ + C = self.stats - self.stats.mean(axis=0) + return np.dot(C.transpose(), C) / self.stats.shape[0] + + def get_model_stat0(self, mod_id): + """Return zero-order statistics of a given model + + Arguments + --------- + mod_id : str + ID of the model which stat0 will be returned. + """ + S = self.stat0[self.modelset == mod_id, :] + return S + + def get_model_stats(self, mod_id): + """Return first-order statistics of a given model. + + Arguments + --------- + mod_id : str + ID of the model which stat1 will be returned. + """ + return self.stats[self.modelset == mod_id, :] + + def sum_stat_per_model(self): + """ + Sum the zero- and first-order statistics per model and store them + in a new EmbeddingMeta. 
+ Returns an EmbeddingMeta object with the statistics summed per model
+ and a numpy array with session_per_model.
+ """
+
+ sts_per_model = EmbeddingMeta()
+ sts_per_model.modelset = np.unique(
+ self.modelset) # get unique speaker ids
+ sts_per_model.segset = copy.deepcopy(sts_per_model.modelset)
+ sts_per_model.stat0 = np.zeros(
+ (sts_per_model.modelset.shape[0], self.stat0.shape[1]),
+ dtype=np.float64, )
+ sts_per_model.stats = np.zeros(
+ (sts_per_model.modelset.shape[0], self.stats.shape[1]),
+ dtype=np.float64, )
+
+ session_per_model = np.zeros(np.unique(self.modelset).shape[0])
+
+ # For each model, sum the stats
+ for idx, model in enumerate(sts_per_model.modelset):
+ sts_per_model.stat0[idx, :] = self.get_model_stat0(model).sum(
+ axis=0)
+ sts_per_model.stats[idx, :] = self.get_model_stats(model).sum(
+ axis=0)
+ session_per_model[idx] += self.get_model_stats(model).shape[0]
+ return sts_per_model, session_per_model
+
+ def center_stats(self, mu):
+ """
+ Center first-order statistics.
+
+ Arguments
+ ---------
+ mu : array
+ Array to center on.
+ """
+
+ dim = self.stats.shape[1] // self.stat0.shape[1]
+ index_map = np.repeat(np.arange(self.stat0.shape[1]), dim)
+ self.stats = self.stats - (self.stat0[:, index_map] *
+ mu.astype(np.float64))
+
+ def rotate_stats(self, R):
+ """
+ Rotate first-order statistics by a right-product.
+
+ Arguments
+ ---------
+ R : ndarray
+ Matrix to use for the right product on the first-order statistics.
+ """
+ self.stats = np.dot(self.stats, R)
+
+ def whiten_stats(self, mu, sigma, isSqrInvSigma=False):
+ """
+ Whiten first-order statistics.
+ If sigma.ndim == 1, case of a diagonal covariance.
+ If sigma.ndim == 2, case of a single Gaussian with full covariance.
+ If sigma.ndim == 3, case of a full covariance UBM.
+
+ Arguments
+ ---------
+ mu : array
+ Mean vector to be subtracted from the statistics.
+ sigma : ndarray
+ Covariance matrix or covariance super-vector.
+ isSqrInvSigma : bool
+ True if the input sigma matrix is the inverse of the square root of a covariance matrix.
+ """
+
+ if sigma.ndim == 1:
+ self.center_stats(mu)
+ self.stats = self.stats / np.sqrt(sigma.astype(np.float64))
+
+ elif sigma.ndim == 2:
+ # Compute the inverse square root of the covariance matrix sigma
+ sqr_inv_sigma = sigma
+
+ if not isSqrInvSigma:
+ eigen_values, eigen_vectors = linalg.eigh(sigma)
+ ind = eigen_values.real.argsort()[::-1]
+ eigen_values = eigen_values.real[ind]
+ eigen_vectors = eigen_vectors.real[:, ind]
+
+ sqr_inv_eval_sigma = 1 / np.sqrt(eigen_values.real)
+ sqr_inv_sigma = np.dot(eigen_vectors,
+ np.diag(sqr_inv_eval_sigma))
+
+ # Whitening of the first-order statistics
+ self.center_stats(mu)
+ self.rotate_stats(sqr_inv_sigma)
+
+ elif sigma.ndim == 3:
+ # we assume that sigma is a 3D ndarray of size D x n x n
+ # where D is the number of distributions and n is the dimension of a single distribution
+ n = self.stats.shape[1] // self.stat0.shape[1]
+ sess_nb = self.stat0.shape[0]
+ self.center_stats(mu)
+ self.stats = (np.einsum("ikj,ikl->ilj",
+ self.stats.T.reshape(-1, n, sess_nb), sigma)
+ .reshape(-1, sess_nb).T)
+
+ else:
+ raise Exception("Wrong dimension of Sigma, must be 1, 2 or 3")
+
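As a quick sanity check on the whitening above: whitening the statistics with their own total covariance should leave them with an (approximately) identity covariance. A minimal sketch on synthetic data, assuming only the EmbeddingMeta class in this file:

    import numpy as np

    rng = np.random.RandomState(0)
    embeddings = rng.rand(50, 8).dot(rng.rand(8, 8))  # correlated embeddings
    meta = EmbeddingMeta(
        segset=np.array(["sg%d" % i for i in range(50)], dtype="|O"),
        modelset=np.array(["md%d" % (i % 5) for i in range(50)], dtype="|O"),
        stats=embeddings)
    mu = meta.get_mean_stats()
    sigma = meta.get_total_covariance_stats()
    meta.whiten_stats(mu, sigma)
    # The total covariance of the whitened stats is (numerically) identity:
    print(np.allclose(meta.get_total_covariance_stats(), np.eye(8), atol=1e-8))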
+ def align_models(self, model_list):
+ """
+ Align models of the current EmbeddingMeta to match a list of models
+ provided as input parameter. The size of the EmbeddingMeta might be
+ reduced to match the input list of models.
+
+ Arguments
+ ---------
+ model_list : ndarray of strings
+ List of models to match.
+ """
+ indx = np.array(
+ [np.argwhere(self.modelset == v)[0][0] for v in model_list])
+ self.segset = self.segset[indx]
+ self.modelset = self.modelset[indx]
+ self.stat0 = self.stat0[indx, :]
+ self.stats = self.stats[indx, :]
+
+ def align_segments(self, segment_list):
+ """
+ Align segments of the current EmbeddingMeta to match a list of segments
+ provided as input parameter. The size of the EmbeddingMeta might be
+ reduced to match the input list of segments.
+
+ Arguments
+ ---------
+ segment_list : ndarray of strings
+ List of segments to match.
+ """
+ indx = np.array(
+ [np.argwhere(self.segset == v)[0][0] for v in segment_list])
+ self.segset = self.segset[indx]
+ self.modelset = self.modelset[indx]
+ self.stat0 = self.stat0[indx, :]
+ self.stats = self.stats[indx, :]
+
 
 class SpecClustUnorm:
 """
diff --git a/paddlespeech/vector/cluster/plda.py b/paddlespeech/vector/cluster/plda.py
new file mode 100644
index 00000000..474a70e2
--- /dev/null
+++ b/paddlespeech/vector/cluster/plda.py
@@ -0,0 +1,585 @@
+# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A popular speaker recognition/diarization model (LDA and PLDA).
+
+Relevant Papers
+ - This implementation of PLDA is based on the following papers.
+
+ - PLDA model Training
+ * Ye Jiang et al., "PLDA Modeling in I-Vector and Supervector Space for Speaker Verification," in Interspeech, 2012.
+ * Patrick Kenny et al., "PLDA for speaker verification with utterances of arbitrary duration," in ICASSP, 2013.
+
+ - PLDA scoring (fast scoring)
+ * Daniel Garcia-Romero et al., "Analysis of i-vector length normalization in speaker recognition systems," in Interspeech, 2011.
+ * Weiwei Lin et al., "Fast Scoring for PLDA with Uncertainty Propagation," in Odyssey, 2016.
+ * Kong Aik Lee et al., "Multi-session PLDA Scoring of I-vector for Partially Open-Set Speaker Detection," in Interspeech, 2013.
+
+Credits
+ This code is adapted from: https://git-lium.univ-lemans.fr/Larcher/sidekit
+"""
+import copy
+import pickle
+
+import numpy
+from scipy import linalg
+
+from paddlespeech.vector.cluster.diarization import EmbeddingMeta
+
+
+def ismember(list1, list2):
+ # Element-wise membership test: for each item of list1, is it in list2?
+ c = [item in list2 for item in list1]
+ return c
+
+
+def diff(list1, list2):
+ # Sorted elements of list1 that are not in list2. This helper is needed
+ # by Ndx.filter below and mirrors the sidekit utility of the same name.
+ c = [item for item in list1 if item not in list2]
+ c.sort()
+ return c
+
+
+class Ndx:
+ """
+ A class that encodes trial index information. It has a list of
+ model names and a list of test segment names and a matrix
+ indicating which combinations of model and test segment are
+ trials of interest.
+
+ Arguments
+ ---------
+ modelset : list
+ List of unique models in an ndarray.
+ segset : list
+ List of unique test segments in an ndarray.
+ trialmask : 2D ndarray of bool
+ Rows correspond to the models and columns to the test segments. True, if the trial is of interest.
+ """
+
+ def __init__(self,
+ ndx_file_name="",
+ models=numpy.array([]),
+ testsegs=numpy.array([])):
+ """
+ Initialize an Ndx object by loading information from a file.
+
+ Arguments
+ ---------
+ ndx_file_name : str
+ Name of the file to load.
+ """
+ self.modelset = numpy.empty(0, dtype="|O")
+ self.segset = numpy.empty(0, dtype="|O")
+ self.trialmask = numpy.array([], dtype="bool")
+
+ if ndx_file_name == "":
+ # This is needed to make the sizes the same
+ d = models.shape[0] - testsegs.shape[0]
+ if d != 0:
+ if d > 0:
+ last = str(testsegs[-1])
+ pad = numpy.array([last] * d)
+ testsegs = numpy.hstack((testsegs, pad))
+ else:
+ d = abs(d)
+ last = str(models[-1])
+ pad = numpy.array([last] * d)
+ models = numpy.hstack((models, pad))
+
+ modelset = numpy.unique(models)
+ segset = numpy.unique(testsegs)
+
+ trialmask = numpy.zeros(
+ (modelset.shape[0], segset.shape[0]), dtype="bool")
+ for m in range(modelset.shape[0]):
+ segs = testsegs[numpy.array(ismember(models, modelset[m]))]
+ trialmask[m, ] = ismember(segset, segs) # noqa E231
+
+ self.modelset = modelset
+ self.segset = segset
+ self.trialmask = trialmask
+ assert self.validate(), "Wrong Ndx format"
+
+ else:
+ ndx = Ndx.read(ndx_file_name)
+ self.modelset = ndx.modelset
+ self.segset = ndx.segset
+ self.trialmask = ndx.trialmask
+
+ def save_ndx_object(self, output_file_name):
+ with open(output_file_name, "wb") as output:
+ pickle.dump(self, output, pickle.HIGHEST_PROTOCOL)
+
+ def filter(self, modlist, seglist, keep):
+ """
+ Removes some of the information in an Ndx. Useful for creating a
+ gender-specific Ndx from a pooled-gender Ndx. Depending on the
+ value of 'keep', the two input lists indicate the strings to
+ retain or the strings to discard.
+
+ Arguments
+ ---------
+ modlist : array
+ A cell array of strings which will be compared with the modelset.
+ seglist : array
+ A cell array of strings which will be compared with the segset.
+ keep : bool
+ Indicating whether modlist and seglist are the models to keep or discard.
+ """
+ if keep:
+ keepmods = modlist
+ keepsegs = seglist
+ else:
+ keepmods = diff(self.modelset, modlist)
+ keepsegs = diff(self.segset, seglist)
+
+ keepmodidx = numpy.array(ismember(self.modelset, keepmods))
+ keepsegidx = numpy.array(ismember(self.segset, keepsegs))
+
+ outndx = Ndx()
+ outndx.modelset = self.modelset[keepmodidx]
+ outndx.segset = self.segset[keepsegidx]
+ tmp = self.trialmask[numpy.array(keepmodidx), :]
+ outndx.trialmask = tmp[:, numpy.array(keepsegidx)]
+
+ assert outndx.validate(), "Wrong Ndx format"
+
+ if self.modelset.shape[0] > outndx.modelset.shape[0]:
+ print("Number of models reduced from %d to %d" %
+ (self.modelset.shape[0], outndx.modelset.shape[0]))
+ if self.segset.shape[0] > outndx.segset.shape[0]:
+ print("Number of test segments reduced from %d to %d" %
+ (self.segset.shape[0], outndx.segset.shape[0]))
+ return outndx
+
+ def validate(self):
+ """
+ Checks that an object of type Ndx obeys certain rules that
+ must always be true. Returns a boolean value indicating whether the object is valid.
+ """
+ ok = isinstance(self.modelset, numpy.ndarray)
+ ok &= isinstance(self.segset, numpy.ndarray)
+ ok &= isinstance(self.trialmask, numpy.ndarray)
+
+ ok &= self.modelset.ndim == 1
+ ok &= self.segset.ndim == 1
+ ok &= self.trialmask.ndim == 2
+
+ ok &= self.trialmask.shape == (self.modelset.shape[0],
+ self.segset.shape[0], )
+ return ok
+
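The trial mask built by the constructor is models x segments; a minimal sketch with made-up model/segment names, assuming only the Ndx class above:

    import numpy

    models = numpy.array(["spk1", "spk1", "spk2"], dtype="|O")
    testsegs = numpy.array(["uttA", "uttB", "uttC"], dtype="|O")
    ndx = Ndx(models=models, testsegs=testsegs)
    print(ndx.modelset)   # ['spk1' 'spk2']
    print(ndx.segset)     # ['uttA' 'uttB' 'uttC']
    # One row per model, one column per test segment:
    print(ndx.trialmask)  # [[ True  True False]
                          #  [False False  True]]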
+
+class Scores:
+ """
+ A class for storing scores for trials. The modelset and segset
+ fields are lists of model and test segment names respectively.
+ The element i,j of scoremat and scoremask corresponds to the
+ trial involving model i and test segment j.
+
+ Arguments
+ ---------
+ modelset : list
+ List of unique models in an ndarray.
+ segset : list
+ List of unique test segments in an ndarray.
+ scoremask : 2D ndarray of bool
+ Indicates the trials of interest, i.e.,
+ the entry i,j in scoremat should be ignored if scoremask[i,j] is False.
+ scoremat : 2D ndarray
+ Scores matrix.
+ """
+
+ def __init__(self, scores_file_name=""):
+ """
+ Initialize a Scores object by loading information from a file in HDF5 format.
+
+ Arguments
+ ---------
+ scores_file_name : str
+ Name of the file to load.
+ """
+ self.modelset = numpy.empty(0, dtype="|O")
+ self.segset = numpy.empty(0, dtype="|O")
+ self.scoremask = numpy.array([], dtype="bool")
+ self.scoremat = numpy.array([])
+
+ if scores_file_name == "":
+ pass
+ else:
+ tmp = Scores.read(scores_file_name)
+ self.modelset = tmp.modelset
+ self.segset = tmp.segset
+ self.scoremask = tmp.scoremask
+ self.scoremat = tmp.scoremat
+
+ def __repr__(self):
+ ch = "modelset:\n"
+ ch += str(self.modelset) + "\n"
+ ch += "segset:\n"
+ ch += str(self.segset) + "\n"
+ ch += "scoremask:\n"
+ ch += self.scoremask.__repr__() + "\n"
+ ch += "scoremat:\n"
+ ch += self.scoremat.__repr__() + "\n"
+ return ch
+
+
+def fa_model_loop(
+ batch_start,
+ mini_batch_indices,
+ factor_analyser,
+ stat0,
+ stats,
+ e_h,
+ e_hh, ):
+ """
+ A function for PLDA estimation (the E-step of the EM loop).
+
+ Arguments
+ ---------
+ batch_start : int
+ Index to start at in the list.
+ mini_batch_indices : list
+ Indices of the elements in the list (should start at zero).
+ factor_analyser : instance of PLDA class
+ PLDA class object.
+ stat0 : tensor
+ Matrix of zero-order statistics.
+ stats : tensor
+ Matrix of first-order statistics.
+ e_h : tensor
+ An accumulator matrix.
+ e_hh : tensor
+ An accumulator matrix.
+ """
+ rank = factor_analyser.F.shape[1]
+ if factor_analyser.Sigma.ndim == 2:
+ A = factor_analyser.F.T.dot(factor_analyser.F)
+ inv_lambda_unique = dict()
+ for sess in numpy.unique(stat0[:, 0]):
+ inv_lambda_unique[sess] = linalg.inv(sess * A + numpy.eye(A.shape[
+ 0]))
+
+ tmp = numpy.zeros(
+ (factor_analyser.F.shape[1], factor_analyser.F.shape[1]),
+ dtype=numpy.float64, )
+
+ for idx in mini_batch_indices:
+ if factor_analyser.Sigma.ndim == 1:
+ inv_lambda = linalg.inv(
+ numpy.eye(rank) + (factor_analyser.F.T * stat0[
+ idx + batch_start, :]).dot(factor_analyser.F))
+ else:
+ inv_lambda = inv_lambda_unique[stat0[idx + batch_start, 0]]
+
+ aux = factor_analyser.F.T.dot(stats[idx + batch_start, :])
+ numpy.dot(aux, inv_lambda, out=e_h[idx])
+ e_hh[idx] = inv_lambda + numpy.outer(e_h[idx], e_h[idx], tmp)
+
+
+def _check_missing_model(enroll, test, ndx):
+ # Remove missing models and test segments
+ clean_ndx = ndx.filter(enroll.modelset, test.segset, True)
+
+ # Align EmbeddingMeta to match the clean_ndx
+ enroll.align_models(clean_ndx.modelset)
+ test.align_segments(clean_ndx.segset)
+
+ return clean_ndx
+
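For orientation before the scoring function: under the simplified PLDA model x = mu + F h + eps with eps ~ N(0, Sigma), two embeddings of the same speaker share the latent factor h, so a centered enrol/test pair has joint covariance [[Sigma_tot, Sigma_ac], [Sigma_ac, Sigma_tot]] with Sigma_ac = F F^T and Sigma_tot = Sigma_ac + Sigma, while for different speakers the cross blocks vanish. The score is the log-likelihood ratio between these two Gaussians, which the code evaluates cheaply through the Phi/Psi blocks. A minimal numeric sketch of this equivalence on synthetic data (assumes the classes above and fast_PLDA_scoring just below):

    import numpy
    from scipy.stats import multivariate_normal

    rng = numpy.random.RandomState(0)
    d, r = 4, 2
    F_ = rng.rand(d, r)
    Sigma_ = numpy.eye(d) * 0.5
    mu_ = numpy.zeros(d)
    x1, x2 = rng.rand(d), rng.rand(d)

    # Direct evaluation of the two-covariance log-likelihood ratio
    S_ac = F_.dot(F_.T)
    S_tot = S_ac + Sigma_
    zeros = numpy.zeros((d, d))
    cov_same = numpy.block([[S_tot, S_ac], [S_ac, S_tot]])
    cov_diff = numpy.block([[S_tot, zeros], [zeros, S_tot]])
    x = numpy.concatenate([x1, x2])
    llr = (multivariate_normal.logpdf(x, numpy.zeros(2 * d), cov_same)
           - multivariate_normal.logpdf(x, numpy.zeros(2 * d), cov_diff))

    # The same single trial through the fast scoring path
    enroll = EmbeddingMeta(modelset=numpy.array(["m"], dtype="|O"),
                           segset=numpy.array(["m"], dtype="|O"),
                           stats=x1[None, :])
    test = EmbeddingMeta(modelset=numpy.array(["t"], dtype="|O"),
                         segset=numpy.array(["t"], dtype="|O"),
                         stats=x2[None, :])
    ndx = Ndx(models=numpy.array(["m"], dtype="|O"),
              testsegs=numpy.array(["t"], dtype="|O"))
    scores = fast_PLDA_scoring(enroll, test, ndx, mu_, F_, Sigma_)
    print(numpy.allclose(scores.scoremat[0, 0], llr))  # True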
+
+def fast_PLDA_scoring(
+ enroll,
+ test,
+ ndx,
+ mu,
+ F,
+ Sigma,
+ test_uncertainty=None,
+ Vtrans=None,
+ p_known=0.0,
+ scaling_factor=1.0,
+ check_missing=True, ):
+ """
+ Compute the PLDA scores between two sets of vectors. The list of
+ trials to perform is given in an Ndx object. PLDA matrices have to be
+ pre-computed. i-vectors/x-vectors are supposed to be whitened before.
+
+ Arguments
+ ---------
+ enroll : speechbrain.utils.Xvector_PLDA_sp.StatObject_SB
+ A StatServer in which stat1 are xvectors.
+ test : speechbrain.utils.Xvector_PLDA_sp.StatObject_SB
+ A StatServer in which stat1 are xvectors.
+ ndx : speechbrain.utils.Xvector_PLDA_sp.Ndx
+ An Ndx object defining the list of trials to perform.
+ mu : double
+ The mean vector of the PLDA Gaussian.
+ F : tensor
+ The between-class co-variance matrix of the PLDA.
+ Sigma : tensor
+ The residual covariance matrix.
+ p_known : float
+ Probability of having a known speaker for the open-set
+ identification case (=1 for the verification task and =0 for the
+ closed-set case).
+ check_missing : bool
+ If True, check that all models and segments exist.
+ """
+
+ enroll_ctr = copy.deepcopy(enroll)
+ test_ctr = copy.deepcopy(test)
+
+ # Remove missing models and test segments
+ if check_missing:
+ clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx)
+ else:
+ clean_ndx = ndx
+
+ # Center the i-vectors around the PLDA mean
+ enroll_ctr.center_stats(mu)
+ test_ctr.center_stats(mu)
+
+ # Compute constant component of the PLDA distribution
+ invSigma = linalg.inv(Sigma)
+ I_spk = numpy.eye(F.shape[1], dtype="float")
+
+ K = F.T.dot(invSigma * scaling_factor).dot(F)
+ K1 = linalg.inv(K + I_spk)
+ K2 = linalg.inv(2 * K + I_spk)
+
+ # Compute the Gaussian distribution constant
+ alpha1 = numpy.linalg.slogdet(K1)[1]
+ alpha2 = numpy.linalg.slogdet(K2)[1]
+ plda_cst = alpha2 / 2.0 - alpha1
+
+ # Compute intermediate matrices
+ Sigma_ac = numpy.dot(F, F.T)
+ Sigma_tot = Sigma_ac + Sigma
+ Sigma_tot_inv = linalg.inv(Sigma_tot)
+
+ Tmp = linalg.inv(Sigma_tot - Sigma_ac.dot(Sigma_tot_inv).dot(Sigma_ac))
+ Phi = Sigma_tot_inv - Tmp
+ Psi = Sigma_tot_inv.dot(Sigma_ac).dot(Tmp)
+
+ # Compute the different parts of PLDA score
+ model_part = 0.5 * numpy.einsum("ij, ji->i",
+ enroll_ctr.stats.dot(Phi),
+ enroll_ctr.stats.T)
+ seg_part = 0.5 * numpy.einsum("ij, ji->i",
+ test_ctr.stats.dot(Phi), test_ctr.stats.T)
+
+ # Compute verification scores
+ score = Scores() # noqa F821
+ score.modelset = clean_ndx.modelset
+ score.segset = clean_ndx.segset
+ score.scoremask = clean_ndx.trialmask
+
+ score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst
+ score.scoremat += enroll_ctr.stats.dot(Psi).dot(test_ctr.stats.T)
+ score.scoremat *= scaling_factor
+
+ # Case of open-set identification, we compute the log-likelihood
+ # by taking into account the probability of having a known impostor
+ # or an out-of set class
+ if p_known != 0:
+ N = score.scoremat.shape[0]
+ open_set_scores = numpy.empty(score.scoremat.shape)
+ tmp = numpy.exp(score.scoremat)
+ for ii in range(N):
+ # open-set term
+ open_set_scores[ii, :] = score.scoremat[ii, :] - numpy.log(
+ p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / (
+ N - 1) + (1 - p_known))
+ score.scoremat = open_set_scores
+
+ return score
+
+
+class PLDA:
+ """
+ A class to train a PLDA model from embeddings.
+
+ The input is in paddlespeech.vector.cluster.diarization.EmbeddingMeta format.
+ Trains a simplified PLDA model with no within-class covariance matrix but
+ a full residual covariance matrix.
+
+ Arguments
+ ---------
+ mean : tensor
+ Mean of the vectors.
+ F : tensor
+ Eigenvoice matrix.
+ Sigma : tensor
+ Residual matrix.
+ """ + + def __init__( + self, + mean=None, + F=None, + Sigma=None, + rank_f=100, + nb_iter=10, + scaling_factor=1.0, ): + self.mean = None + self.F = None + self.Sigma = None + self.rank_f = rank_f + self.nb_iter = nb_iter + self.scaling_factor = scaling_factor + + if mean is not None: + self.mean = mean + if F is not None: + self.F = F + if Sigma is not None: + self.Sigma = Sigma + + def plda( + self, + emb_meta=None, + output_file_name=None, ): + """ + Trains PLDA model with no within class covariance matrix but full residual covariance matrix. + + Arguments + --------- + emb_meta : paddlespeech.vector.cluster.diarization.EmbeddingMeta + Contains vectors and meta-information to perform PLDA + rank_f : int + Rank of the between-class covariance matrix. + nb_iter : int + Number of iterations to run. + scaling_factor : float + Scaling factor to downscale statistics (value between 0 and 1). + output_file_name : str + Name of the output file where to store PLDA model. + """ + + # Dimension of the vector (x-vectors stored in stats) + vect_size = emb_meta.stats.shape[1] + + # Initialize mean and residual covariance from the training data + self.mean = emb_meta.get_mean_stats() + self.Sigma = emb_meta.get_total_covariance_stats() + + # Sum stat0 and stat1 for each speaker model + model_shifted_stat, session_per_model = emb_meta.sum_stat_per_model() + + # Number of speakers (classes) in training set + class_nb = model_shifted_stat.modelset.shape[0] + + # Multiply statistics by scaling_factor + model_shifted_stat.stat0 *= self.scaling_factor + model_shifted_stat.stats *= self.scaling_factor + session_per_model *= self.scaling_factor + + # Covariance for stats + sigma_obs = emb_meta.get_total_covariance_stats() + evals, evecs = linalg.eigh(sigma_obs) + + # Initial F (eigen voice matrix) from rank + idx = numpy.argsort(evals)[::-1] + evecs = evecs.real[:, idx[:self.rank_f]] + self.F = evecs[:, :self.rank_f] + + # Estimate PLDA model by iterating the EM algorithm + for it in range(self.nb_iter): + + # E-step + + # Copy stats as they will be whitened with a different Sigma for each iteration + local_stat = copy.deepcopy(model_shifted_stat) + + # Whiten statistics (with the new mean and Sigma) + local_stat.whiten_stats(self.mean, self.Sigma) + + # Whiten the EigenVoice matrix + eigen_values, eigen_vectors = linalg.eigh(self.Sigma) + ind = eigen_values.real.argsort()[::-1] + eigen_values = eigen_values.real[ind] + eigen_vectors = eigen_vectors.real[:, ind] + sqr_inv_eval_sigma = 1 / numpy.sqrt(eigen_values.real) + sqr_inv_sigma = numpy.dot(eigen_vectors, + numpy.diag(sqr_inv_eval_sigma)) + self.F = sqr_inv_sigma.T.dot(self.F) + + # Replicate self.stat0 + index_map = numpy.zeros(vect_size, dtype=int) + _stat0 = local_stat.stat0[:, index_map] + + e_h = numpy.zeros((class_nb, self.rank_f)) + e_hh = numpy.zeros((class_nb, self.rank_f, self.rank_f)) + + # loop on model id's + fa_model_loop( + batch_start=0, + mini_batch_indices=numpy.arange(class_nb), + factor_analyser=self, + stat0=_stat0, + stats=local_stat.stats, + e_h=e_h, + e_hh=e_hh, ) + + # Accumulate for minimum divergence step + _R = numpy.sum(e_hh, axis=0) / session_per_model.shape[0] + + _C = e_h.T.dot(local_stat.stats).dot(linalg.inv(sqr_inv_sigma)) + _A = numpy.einsum("ijk,i->jk", e_hh, local_stat.stat0.squeeze()) + + # M-step + self.F = linalg.solve(_A, _C).T + + # Update the residual covariance + self.Sigma = sigma_obs - self.F.dot(_C) / session_per_model.sum() + + # Minimum Divergence step + self.F = self.F.dot(linalg.cholesky(_R)) + + +if 
__name__ == '__main__': + import random + + dim, N, n_spkrs = 10, 100, 10 + train_xv = numpy.random.rand(N, dim) + md = ['md' + str(random.randrange(1, n_spkrs, 1)) for i in range(N)] # spk + modelset = numpy.array(md, dtype="|O") + sg = ['sg' + str(i) for i in range(N)] # utt + segset = numpy.array(sg, dtype="|O") + stat0 = numpy.array([[1.0]] * N) + xvectors_stat = EmbeddingMeta( + modelset=modelset, segset=segset, stats=train_xv) + # Training PLDA model: M ~ (mean, F, Sigma) + plda = PLDA(rank_f=5) + plda.plda(xvectors_stat) + print(plda.mean.shape) #(10,) + print(plda.F.shape) #(10, 5) + print(plda.Sigma.shape) #(10, 10) + # Enrollment (20 utts), + en_N = 20 + en_xv = numpy.random.rand(en_N, dim) + en_sgs = ['en' + str(i) for i in range(en_N)] + en_sets = numpy.array(en_sgs, dtype="|O") + en_stat = EmbeddingMeta(modelset=en_sets, segset=en_sets, stats=en_xv) + # Test (30 utts) + te_N = 30 + te_xv = numpy.random.rand(te_N, dim) + te_sgs = ['te' + str(i) for i in range(te_N)] + te_sets = numpy.array(te_sgs, dtype="|O") + te_stat = EmbeddingMeta(modelset=te_sets, segset=te_sets, stats=te_xv) + ndx = Ndx(models=en_sets, testsegs=te_sets) + # PLDA Scoring + scores_plda = fast_PLDA_scoring(en_stat, te_stat, ndx, plda.mean, plda.F, + plda.Sigma) + print(scores_plda.scoremat.shape) #(20, 30) From 44c66234487da39ff4886656d284737bb41f375b Mon Sep 17 00:00:00 2001 From: qingen Date: Sun, 10 Apr 2022 22:08:17 +0800 Subject: [PATCH 02/18] [vec][score] update plda model, test=doc fix #1667 --- paddlespeech/vector/cluster/plda.py | 210 +++++++++++++--------------- 1 file changed, 100 insertions(+), 110 deletions(-) diff --git a/paddlespeech/vector/cluster/plda.py b/paddlespeech/vector/cluster/plda.py index 474a70e2..78a02a32 100644 --- a/paddlespeech/vector/cluster/plda.py +++ b/paddlespeech/vector/cluster/plda.py @@ -299,114 +299,6 @@ def _check_missing_model(enroll, test, ndx): return clean_ndx -def fast_PLDA_scoring( - enroll, - test, - ndx, - mu, - F, - Sigma, - test_uncertainty=None, - Vtrans=None, - p_known=0.0, - scaling_factor=1.0, - check_missing=True, ): - """ - Compute the PLDA scores between to sets of vectors. The list of - trials to perform is given in an Ndx object. PLDA matrices have to be - pre-computed. i-vectors/x-vectors are supposed to be whitened before. - - Arguments - --------- - enroll : speechbrain.utils.Xvector_PLDA_sp.StatObject_SB - A StatServer in which stat1 are xvectors. - test : speechbrain.utils.Xvector_PLDA_sp.StatObject_SB - A StatServer in which stat1 are xvectors. - ndx : speechbrain.utils.Xvector_PLDA_sp.Ndx - An Ndx object defining the list of trials to perform. - mu : double - The mean vector of the PLDA gaussian. - F : tensor - The between-class co-variance matrix of the PLDA. - Sigma: tensor - The residual covariance matrix. - p_known : float - Probability of having a known speaker for open-set - identification case (=1 for the verification task and =0 for the - closed-set case). - check_missing : bool - If True, check that all models and segments exist. 
- """ - - enroll_ctr = copy.deepcopy(enroll) - test_ctr = copy.deepcopy(test) - - # Remove missing models and test segments - if check_missing: - clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx) - else: - clean_ndx = ndx - - # Center the i-vectors around the PLDA mean - enroll_ctr.center_stats(mu) - test_ctr.center_stats(mu) - - # Compute constant component of the PLDA distribution - invSigma = linalg.inv(Sigma) - I_spk = numpy.eye(F.shape[1], dtype="float") - - K = F.T.dot(invSigma * scaling_factor).dot(F) - K1 = linalg.inv(K + I_spk) - K2 = linalg.inv(2 * K + I_spk) - - # Compute the Gaussian distribution constant - alpha1 = numpy.linalg.slogdet(K1)[1] - alpha2 = numpy.linalg.slogdet(K2)[1] - plda_cst = alpha2 / 2.0 - alpha1 - - # Compute intermediate matrices - Sigma_ac = numpy.dot(F, F.T) - Sigma_tot = Sigma_ac + Sigma - Sigma_tot_inv = linalg.inv(Sigma_tot) - - Tmp = linalg.inv(Sigma_tot - Sigma_ac.dot(Sigma_tot_inv).dot(Sigma_ac)) - Phi = Sigma_tot_inv - Tmp - Psi = Sigma_tot_inv.dot(Sigma_ac).dot(Tmp) - - # Compute the different parts of PLDA score - model_part = 0.5 * numpy.einsum("ij, ji->i", - enroll_ctr.stats.dot(Phi), - enroll_ctr.stats.T) - seg_part = 0.5 * numpy.einsum("ij, ji->i", - test_ctr.stats.dot(Phi), test_ctr.stats.T) - - # Compute verification scores - score = Scores() # noqa F821 - score.modelset = clean_ndx.modelset - score.segset = clean_ndx.segset - score.scoremask = clean_ndx.trialmask - - score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst - score.scoremat += enroll_ctr.stats.dot(Psi).dot(test_ctr.stats.T) - score.scoremat *= scaling_factor - - # Case of open-set identification, we compute the log-likelihood - # by taking into account the probability of having a known impostor - # or an out-of set class - if p_known != 0: - N = score.scoremat.shape[0] - open_set_scores = numpy.empty(score.scoremat.shape) - tmp = numpy.exp(score.scoremat) - for ii in range(N): - # open-set term - open_set_scores[ii, :] = score.scoremat[ii, :] - numpy.log( - p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / ( - N - 1) + (1 - p_known)) - score.scoremat = open_set_scores - - return score - - class PLDA: """ A class to train PLDA model from embeddings. @@ -547,6 +439,105 @@ class PLDA: # Minimum Divergence step self.F = self.F.dot(linalg.cholesky(_R)) + def scoring( + self, + enroll, + test, + ndx, + test_uncertainty=None, + Vtrans=None, + p_known=0.0, + scaling_factor=1.0, + check_missing=True, ): + """ + Compute the PLDA scores between to sets of vectors. The list of + trials to perform is given in an Ndx object. PLDA matrices have to be + pre-computed. i-vectors/x-vectors are supposed to be whitened before. + + Arguments + --------- + enroll : paddlespeech.vector.cluster.diarization.EmbeddingMeta + A EmbeddingMeta in which stats are xvectors. + test : paddlespeech.vector.cluster.diarization.EmbeddingMeta + A EmbeddingMeta in which stats are xvectors. + ndx : paddlespeech.vector.cluster.plda.Ndx + An Ndx object defining the list of trials to perform. + p_known : float + Probability of having a known speaker for open-set + identification case (=1 for the verification task and =0 for the + closed-set case). + check_missing : bool + If True, check that all models and segments exist. 
+ """ + + enroll_ctr = copy.deepcopy(enroll) + test_ctr = copy.deepcopy(test) + + # Remove missing models and test segments + if check_missing: + clean_ndx = _check_missing_model(enroll_ctr, test_ctr, ndx) + else: + clean_ndx = ndx + + # Center the i-vectors around the PLDA mean + enroll_ctr.center_stats(self.mean) + test_ctr.center_stats(self.mean) + + # Compute constant component of the PLDA distribution + invSigma = linalg.inv(self.Sigma) + I_spk = numpy.eye(self.F.shape[1], dtype="float") + + K = self.F.T.dot(invSigma * scaling_factor).dot(self.F) + K1 = linalg.inv(K + I_spk) + K2 = linalg.inv(2 * K + I_spk) + + # Compute the Gaussian distribution constant + alpha1 = numpy.linalg.slogdet(K1)[1] + alpha2 = numpy.linalg.slogdet(K2)[1] + plda_cst = alpha2 / 2.0 - alpha1 + + # Compute intermediate matrices + Sigma_ac = numpy.dot(self.F, self.F.T) + Sigma_tot = Sigma_ac + self.Sigma + Sigma_tot_inv = linalg.inv(Sigma_tot) + + Tmp = linalg.inv(Sigma_tot - Sigma_ac.dot(Sigma_tot_inv).dot(Sigma_ac)) + Phi = Sigma_tot_inv - Tmp + Psi = Sigma_tot_inv.dot(Sigma_ac).dot(Tmp) + + # Compute the different parts of PLDA score + model_part = 0.5 * numpy.einsum("ij, ji->i", + enroll_ctr.stats.dot(Phi), + enroll_ctr.stats.T) + seg_part = 0.5 * numpy.einsum("ij, ji->i", + test_ctr.stats.dot(Phi), test_ctr.stats.T) + + # Compute verification scores + score = Scores() # noqa F821 + score.modelset = clean_ndx.modelset + score.segset = clean_ndx.segset + score.scoremask = clean_ndx.trialmask + + score.scoremat = model_part[:, numpy.newaxis] + seg_part + plda_cst + score.scoremat += enroll_ctr.stats.dot(Psi).dot(test_ctr.stats.T) + score.scoremat *= scaling_factor + + # Case of open-set identification, we compute the log-likelihood + # by taking into account the probability of having a known impostor + # or an out-of set class + if p_known != 0: + N = score.scoremat.shape[0] + open_set_scores = numpy.empty(score.scoremat.shape) + tmp = numpy.exp(score.scoremat) + for ii in range(N): + # open-set term + open_set_scores[ii, :] = score.scoremat[ii, :] - numpy.log( + p_known * tmp[~(numpy.arange(N) == ii)].sum(axis=0) / ( + N - 1) + (1 - p_known)) + score.scoremat = open_set_scores + + return score + if __name__ == '__main__': import random @@ -580,6 +571,5 @@ if __name__ == '__main__': te_stat = EmbeddingMeta(modelset=te_sets, segset=te_sets, stats=te_xv) ndx = Ndx(models=en_sets, testsegs=te_sets) # PLDA Scoring - scores_plda = fast_PLDA_scoring(en_stat, te_stat, ndx, plda.mean, plda.F, - plda.Sigma) + scores_plda = plda.scoring(en_stat, te_stat, ndx) print(scores_plda.scoremat.shape) #(20, 30) From 8d9bd9a93a1e9f067321af24cbddd47ac92c22f5 Mon Sep 17 00:00:00 2001 From: qingen Date: Mon, 11 Apr 2022 10:56:21 +0800 Subject: [PATCH 03/18] [vec][score] update Copyright, test=doc fix #1667 --- paddlespeech/vector/cluster/diarization.py | 2 +- paddlespeech/vector/cluster/plda.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py index c3dbb120..9fda019e 100644 --- a/paddlespeech/vector/cluster/diarization.py +++ b/paddlespeech/vector/cluster/diarization.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/paddlespeech/vector/cluster/plda.py b/paddlespeech/vector/cluster/plda.py index 78a02a32..81def435 100644 --- a/paddlespeech/vector/cluster/plda.py +++ b/paddlespeech/vector/cluster/plda.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -569,7 +569,7 @@ if __name__ == '__main__': te_sgs = ['te' + str(i) for i in range(te_N)] te_sets = numpy.array(te_sgs, dtype="|O") te_stat = EmbeddingMeta(modelset=te_sets, segset=te_sets, stats=te_xv) - ndx = Ndx(models=en_sets, testsegs=te_sets) + ndx = Ndx(models=en_sets, testsegs=te_sets) # trials # PLDA Scoring scores_plda = plda.scoring(en_stat, te_stat, ndx) print(scores_plda.scoremat.shape) #(20, 30) From 82992b3ed6eaffd78fa27fae57235488f2ded168 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 11:00:04 +0800 Subject: [PATCH 04/18] add test code, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 +++ paddlespeech/server/tests/tts/infer/run.sh | 64 ++ .../server/tests/tts/infer/test_online_tts.py | 650 ++++++++++++++++++ 3 files changed, 814 insertions(+) create mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt create mode 100644 paddlespeech/server/tests/tts/infer/run.sh create mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt new file mode 100644 index 00000000..d8cf367c --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? 
+009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? +009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! +009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? +009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh new file mode 100644 index 00000000..fdceec41 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -0,0 +1,64 @@ +model_path=/home/users/liangyunming/.paddlespeech/models/ +#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn +voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan +#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan + +if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then + am_support_stream=True +else + am_support_stream=False +fi + +# get am file +for file in $(ls $am_model_dir) +do + if [[ $file == *"yaml"* ]]; then + am_config_file=$file + elif [[ $file == *"pdz"* ]]; then + am_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + am_stat_file=$file + elif [[ $file == *"phone"* ]]; then + phones_dict_file=$file + fi + +done + +# get voc file +for file in $(ls $voc_model_dir) +do + if [[ $file == *"yaml"* ]]; then + voc_config_file=$file + elif [[ $file == *"pdz"* ]]; then + voc_ckpt_file=$file + elif [[ $file == *"stat"* ]]; then + voc_stat_file=$file + fi + +done + + +#run +python test_online_tts.py --am fastspeech2_csmsc \ + --am_support_stream $am_support_stream \ + --am_config $am_model_dir/$am_config_file \ + --am_ckpt $am_model_dir/$am_ckpt_file \ + --am_stat $am_model_dir/$am_stat_file \ + --phones_dict $am_model_dir/$phones_dict_file \ + --voc hifigan_csmsc \ + --voc_config $voc_model_dir/$voc_config_file \ + --voc_ckpt $voc_model_dir/$voc_ckpt_file \ + --voc_stat $voc_model_dir/$voc_stat_file \ + --lang zh \ + --device cpu \ + --text ./csmsc_test.txt \ + --output_dir ./output \ + --log_file ./result.log \ + --am_streaming False \ + --am_pad 12 \ + --am_block 42 \ + --voc_streaming True \ + --voc_pad 14 \ + --voc_block 14 \ + diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py 
b/paddlespeech/server/tests/tts/infer/test_online_tts.py new file mode 100644 index 00000000..17ac0ea7 --- /dev/null +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -0,0 +1,650 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import logging +import math +import threading +import time +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from yacs.config import CfgNode + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import model_alias +from paddlespeech.t2s.utils import str2bool + +mel_streaming = None +wav_streaming = None +stream_first_time = 0.0 +voc_stream_st = 0.0 +sample_rate = 0 + + +def denorm(data, mean, std): + return data * std + mean + + +def get_chunks(data, block_size, pad_size, step): + if step == "am": + data_len = data.shape[1] + elif step == "voc": + data_len = data.shape[0] + else: + print("Please set correct type to get chunks, am or voc") + + chunks = [] + n = math.ceil(data_len / block_size) + for i in range(n): + start = max(0, i * block_size - pad_size) + end = min((i + 1) * block_size + pad_size, data_len) + if step == "am": + chunks.append(data[:, start:end, :]) + elif step == "voc": + chunks.append(data[start:end, :]) + else: + print("Please set correct type to get chunks, am or voc") + return chunks + + +def get_stream_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + odim = am_config.n_mels + + am_class = dynamic_import(am_name, model_alias) + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + + return am, am_mu, am_std + + +def init(args): + global sample_rate + # get config + with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + sample_rate = am_config.fs + + # frontend + frontend = get_frontend(args) + + # acoustic model + if args.am_support_stream: + am, am_mu, am_std = get_stream_am_inference(args, am_config) + am_infer_info = [am, am_mu, am_std, am_config] + else: + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + am_infer_info = [am_inference, am_name, am_dataset, am_config] + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + voc_infer_info = 
[voc_inference, voc_config]
+
+ return frontend, am_infer_info, voc_infer_info
+
+
+def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids):
+ am_name = args.am[:args.am.rindex('_')]
+ tone_ids = None
+ if am_name == 'speedyspeech':
+ get_tone_ids = True
+
+ if args.lang == 'zh':
+ input_ids = frontend.get_input_ids(
+ sentence,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids)
+ phone_ids = input_ids["phone_ids"]
+ if get_tone_ids:
+ tone_ids = input_ids["tone_ids"]
+ elif args.lang == 'en':
+ input_ids = frontend.get_input_ids(
+ sentence, merge_sentences=merge_sentences)
+ phone_ids = input_ids["phone_ids"]
+ else:
+ print("lang should be in {'zh', 'en'}!")
+
+ return phone_ids, tone_ids
+
+
+@paddle.no_grad()
+# generate the full mel spectrogram
+def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids):
+ # if the AM model supports streaming
+ if args.am_support_stream:
+ am, am_mu, am_std, am_config = am_infer_info
+ orig_hs, h_masks = am.encoder_infer(part_phone_ids)
+ if args.am_streaming:
+ am_pad = args.am_pad
+ am_block = args.am_block
+ hss = get_chunks(orig_hs, am_block, am_pad, "am")
+ chunk_num = len(hss)
+ mel_list = []
+ for i, hs in enumerate(hss):
+ before_outs, _ = am.decoder(hs)
+ after_outs = before_outs + am.postnet(
+ before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ normalized_mel = after_outs[0]
+ sub_mel = denorm(normalized_mel, am_mu, am_std)
+ # clip output part of pad
+ if i == 0:
+ sub_mel = sub_mel[:-am_pad]
+ elif i == chunk_num - 1:
+ # the last chunk never has enough pad on its right side
+ sub_mel = sub_mel[am_pad:]
+ else:
+ # the next-to-last chunks may also lack full pad on the right
+ sub_mel = sub_mel[am_pad:(am_block + am_pad) -
+ sub_mel.shape[0]]
+ mel_list.append(sub_mel)
+ mel = paddle.concat(mel_list, axis=0)
+
+ else:
+ orig_hs, h_masks = am.encoder_infer(part_phone_ids)
+ before_outs, _ = am.decoder(orig_hs)
+ after_outs = before_outs + am.postnet(
+ before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ normalized_mel = after_outs[0]
+ mel = denorm(normalized_mel, am_mu, am_std)
+
+ else:
+ am_inference, am_name, am_dataset, am_config = am_infer_info
+ # acoustic model
+ if am_name == 'fastspeech2':
+ # multi speaker
+ if am_dataset in {"aishell3", "vctk"}:
+ spk_id = paddle.to_tensor(args.spk_id)
+ mel = am_inference(part_phone_ids, spk_id)
+ else:
+ mel = am_inference(part_phone_ids)
+ elif am_name == 'speedyspeech':
+ if am_dataset in {"aishell3", "vctk"}:
+ spk_id = paddle.to_tensor(args.spk_id)
+ mel = am_inference(part_phone_ids, part_tone_ids, spk_id)
+ else:
+ mel = am_inference(part_phone_ids, part_tone_ids)
+ elif am_name == 'tacotron2':
+ mel = am_inference(part_phone_ids)
+
+ return mel
+
+
+@paddle.no_grad()
+def stream_voc_infer(args, voc_infer_info, mel_len):
+ global mel_streaming
+ global stream_first_time
+ global wav_streaming
+ voc_inference, voc_config = voc_infer_info
+ block = args.voc_block
+ pad = args.voc_pad
+ upsample = voc_config.n_shift
+ wav_list = []
+ flag = 1
+
+ valid_start = 0
+ valid_end = min(valid_start + block, mel_len)
+ actual_start = 0
+ actual_end = min(valid_end + pad, mel_len)
+ mel_chunk = mel_streaming[actual_start:actual_end, :]
+
+ while valid_end <= mel_len:
+ sub_wav = voc_inference(mel_chunk)
+ if flag == 1:
+ stream_first_time = time.time()
+ flag = 0
+
+ # get valid wav
+ start = valid_start - actual_start
+ if valid_end == mel_len:
+ sub_wav = sub_wav[start * upsample:]
+ wav_list.append(sub_wav)
+ break
+ else:
+ end = start + block
+ sub_wav = sub_wav[start * upsample:end * upsample]
+ wav_list.append(sub_wav)
+
+ # generate new mel chunk
+ valid_start = valid_end
+ valid_end = min(valid_start + block, mel_len)
+ if valid_start - pad < 0:
+ actual_start = 0
+ else:
+ actual_start = valid_start - pad
+ actual_end = min(valid_end + pad, mel_len)
+ mel_chunk = mel_streaming[actual_start:actual_end, :]
+
+ wav = paddle.concat(wav_list, axis=0)
+ wav_streaming = wav
+
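The block/pad bookkeeping used by the streaming loops is easier to see on a toy input; a small sketch using the get_chunks helper defined earlier (synthetic frame indices, feature dim 1):

    import numpy as np

    frames = np.arange(10).reshape(10, 1)     # 10 "mel frames"
    chunks = get_chunks(frames, 4, 2, "voc")  # block=4, pad=2
    for chunk in chunks:
        print(chunk.ravel())
    # [0 1 2 3 4 5]      -> valid frames 0-3, plus 2 frames of right context
    # [2 3 4 5 6 7 8 9]  -> valid frames 4-7, plus 2 frames of context each side
    # [6 7 8 9]          -> valid frames 8-9, plus 2 frames of left context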
+
+@paddle.no_grad()
+# non-streaming AM / streaming AM + non-streaming vocoder
+def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids,
+ part_tone_ids):
+ mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids)
+ am_infer_time = time.time()
+ voc_inference, voc_config = voc_infer_info
+ wav = voc_inference(mel)
+ first_response_time = time.time()
+ final_response_time = first_response_time
+ voc_infer_time = first_response_time
+
+ return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav
+
+
+@paddle.no_grad()
+# non-streaming AM + streaming vocoder
+def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids,
+ part_tone_ids):
+ global mel_streaming
+ global stream_first_time
+ global wav_streaming
+
+ mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids)
+ am_infer_time = time.time()
+
+ # voc streaming
+ mel_streaming = mel
+ mel_len = mel.shape[0]
+ stream_voc_infer(args, voc_infer_info, mel_len)
+ first_response_time = stream_first_time
+ wav = wav_streaming
+ final_response_time = time.time()
+ voc_infer_time = final_response_time
+
+ return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav
+
+
+@paddle.no_grad()
+# streaming AM + streaming vocoder
+def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids,
+ part_tone_ids):
+ global mel_streaming
+ global stream_first_time
+ global wav_streaming
+ global voc_stream_st
+ mel_streaming = None
+ flag = 1 # whether the streaming vocoder thread still needs to be started
+
+ am, am_mu, am_std, am_config = am_infer_info
+ orig_hs, h_masks = am.encoder_infer(part_phone_ids)
+ mel_len = orig_hs.shape[1]
+ am_block = args.am_block
+ am_pad = args.am_pad
+ hss = get_chunks(orig_hs, am_block, am_pad, "am")
+ chunk_num = len(hss)
+
+ for i, hs in enumerate(hss):
+ before_outs, _ = am.decoder(hs)
+ after_outs = before_outs + am.postnet(
+ before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ normalized_mel = after_outs[0]
+ sub_mel = denorm(normalized_mel, am_mu, am_std)
+ # clip output part of pad
+ if i == 0:
+ sub_mel = sub_mel[:-am_pad]
+ mel_streaming = sub_mel
+ elif i == chunk_num - 1:
+ # the last chunk never has enough pad on its right side
+ sub_mel = sub_mel[am_pad:]
+ mel_streaming = paddle.concat([mel_streaming, sub_mel])
+ am_infer_time = time.time()
+ else:
+ # the next-to-last chunks may also lack full pad on the right
+ sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]]
+ mel_streaming = paddle.concat([mel_streaming, sub_mel])
+
+ if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad:
+ t = threading.Thread(
+ target=stream_voc_infer, args=(args, voc_infer_info, mel_len, ))
+ t.start()
+ voc_stream_st = time.time()
+ flag = 0
+
+ t.join()
+ final_response_time = time.time()
+ voc_infer_time = final_response_time
+ first_response_time = stream_first_time
+ wav = wav_streaming
+
+ return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav
+
+
+def try_infer(args, logger, frontend, am_infer_info, voc_infer_info):
+ global sample_rate
+ logger.info(
+ "Before the formal test, we synthesize a few sentences to warm up, so that the measured inference speed is stable."
+ )
+ if args.lang == 'zh':
+ sentence = "您好,欢迎使用语音合成服务。"
+ if args.lang == 'en':
+ sentence = "Hello and welcome to the speech synthesis service."
+
+ if args.voc_streaming:
+ if args.am_streaming:
+ infer_func = stream_am_stream_voc
+ else:
+ infer_func = nostream_am_stream_voc
+ else:
+ infer_func = am_nostream_voc
+
+ merge_sentences = True
+ get_tone_ids = False
+ for i in range(3): # run inference 3 times to warm up
+ st = time.time()
+ phone_ids, tone_ids = get_phone(args, frontend, sentence,
+ merge_sentences, get_tone_ids)
+ part_phone_ids = phone_ids[0]
+ if tone_ids:
+ part_tone_ids = tone_ids[0]
+ else:
+ part_tone_ids = None
+
+ am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func(
+ args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids)
+ wav = wav.numpy()
+ duration = wav.size / sample_rate
+ logger.info(
+ f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s"
+ )
+
+
+def evaluate(args, logger, frontend, am_infer_info, voc_infer_info):
+ global sample_rate
+ sentences = get_sentences(args)
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ get_tone_ids = False
+ merge_sentences = True
+
+ # choose infer function
+ if args.voc_streaming:
+ if args.am_streaming:
+ infer_func = stream_am_stream_voc
+ else:
+ infer_func = nostream_am_stream_voc
+ else:
+ infer_func = am_nostream_voc
+
+ final_up_duration = 0.0
+ sentence_count = 0
+ front_time_list = []
+ am_time_list = []
+ voc_time_list = []
+ first_response_list = []
+ final_response_list = []
+ sentence_length_list = []
+ duration_list = []
+
+ for utt_id, sentence in sentences:
+ # front
+ front_st = time.time()
+ phone_ids, tone_ids = get_phone(args, frontend, sentence,
+ merge_sentences, get_tone_ids)
+ part_phone_ids = phone_ids[0]
+ if tone_ids:
+ part_tone_ids = tone_ids[0]
+ else:
+ part_tone_ids = None
+ front_et = time.time()
+ front_time = front_et - front_st
+
+ am_st = time.time()
+ am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func(
+ args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids)
+ am_time = am_infer_time - am_st
+ if args.voc_streaming and args.am_streaming:
+ voc_time = voc_infer_time - voc_stream_st
+ else:
+ voc_time = voc_infer_time - am_infer_time
+
+ first_response = first_response_time - front_st
+ final_response = final_response_time - front_st
+
+ wav = wav.numpy()
+ duration = wav.size / sample_rate
+ sf.write(
+ str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate)
+ print(f"{utt_id} done!")
+
+ sentence_count += 1
+ front_time_list.append(front_time)
+ am_time_list.append(am_time)
+ voc_time_list.append(voc_time)
+ first_response_list.append(first_response)
+ final_response_list.append(final_response)
+ sentence_length_list.append(len(sentence))
+ duration_list.append(duration)
+
+ logger.info(
+ f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \
+ first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;"
+ )
+
+ if final_response > duration:
+ final_up_duration += 1
+
+ all_time_sum = sum(final_response_list)
+ front_rate = sum(front_time_list) / all_time_sum
+ am_rate = sum(am_time_list) / all_time_sum
+ voc_rate = sum(voc_time_list) / all_time_sum
+ rtf = all_time_sum / sum(duration_list)
+
+ logger.info(
+ f"Test text length information: test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: {max(sentence_length_list)}; avg: {sum(sentence_length_list)/len(sentence_length_list)}"
+ )
+ logger.info(
+ f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}"
+ )
+ logger.info(
+ f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%"
+ )
+ logger.info(
+ f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%"
+ )
+ logger.info(
+ f"Vocoder time information: min: {min(voc_time_list)} s; max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%"
+ )
+ logger.info(
+ f"First response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s"
+ )
+ logger.info(
+ f"Final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s"
+ )
+ logger.info(f"RTF is: {rtf}")
+ logger.info(
+ f"The number of sentences whose final response time exceeds the audio duration is {final_up_duration}, ratio: {final_up_duration / sentence_count * 100}%"
+ )
+
+
+def parse_args():
+ # parse args and config
+ parser = argparse.ArgumentParser(
+ description="Synthesize with acoustic model & vocoder")
+ # acoustic model
+ parser.add_argument(
+ '--am',
+ type=str,
+ default='fastspeech2_csmsc',
+ choices=[
+ 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
+ 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
+ 'tacotron2_csmsc', 'tacotron2_ljspeech'
+ ],
+ help='Choose acoustic model type of tts task.')
+ parser.add_argument(
+ '--am_support_stream',
+ type=str2bool,
+ default=False,
+ help='if am model is fastspeech2_csmsc, specify whether it supports streaming'
+ )
+ parser.add_argument(
+ '--am_config',
+ type=str,
+ default=None,
+ help='Config of acoustic model. Use default config when it is None.')
+ parser.add_argument(
+ '--am_ckpt',
+ type=str,
+ default=None,
+ help='Checkpoint file of acoustic model.')
+ parser.add_argument(
+ "--am_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training acoustic model."
+ )
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ '--spk_id',
+ type=int,
+ default=0,
+ help='spk id for multi speaker acoustic model')
+ # vocoder
+ parser.add_argument(
+ '--voc',
+ type=str,
+ default='mb_melgan_csmsc',
+ choices=[
+ 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
+ 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
+ 'wavernn_csmsc'
+ ],
+ help='Choose vocoder type of tts task.')
+ parser.add_argument(
+ '--voc_config',
+ type=str,
+ default=None,
+ help='Config of voc. Use default config when it is None.')
+ parser.add_argument(
+ '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
+ parser.add_argument(
+ "--voc_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training voc."
+ ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + choices=['zh', 'en'], + help='Choose model language. zh or en') + + parser.add_argument( + "--device", type=str, default='cpu', help="set cpu or gpu:id") + + parser.add_argument( + "--text", + type=str, + default="./csmsc_test.txt", + help="text to synthesize, a 'utt_id sentence' pair per line.") + parser.add_argument("--output_dir", type=str, help="output dir.") + parser.add_argument( + "--log_file", type=str, default="result.log", help="log file.") + + parser.add_argument( + "--am_streaming", + type=str2bool, + default=False, + help="whether use streaming acoustic model") + + parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") + + parser.add_argument( + "--am_block", type=int, default=42, help="am block size.") + + parser.add_argument( + "--voc_streaming", + type=str2bool, + default=False, + help="whether use streaming vocoder model") + + parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") + + parser.add_argument( + "--voc_block", type=int, default=14, help="voc block size.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + paddle.set_device(args.device) + if args.am_support_stream: + assert (args.am == 'fastspeech2_csmsc') + if args.am_streaming: + assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') + if args.voc_streaming: + assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + + logger = logging.getLogger() + fhandler = logging.FileHandler(filename=args.log_file, mode='w') + formatter = logging.Formatter( + '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' + ) + fhandler.setFormatter(formatter) + logger.addHandler(fhandler) + logger.setLevel(logging.DEBUG) + + # set basic information + logger.info( + f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" + ) + logger.info( + f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" + ) + + # get information about model + frontend, am_infer_info, voc_infer_info = init(args) + logger.info( + "************************ try infer *********************************") + try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + logger.info( + "************************ normal test *******************************") + evaluate(args, logger, frontend, am_infer_info, voc_infer_info) + + +if __name__ == "__main__": + main() From 4b111146dc959daac319879ba8d89fb9a3f24b75 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:31:03 +0800 Subject: [PATCH 05/18] code format, test=doc --- .../server/tests/tts/infer/csmsc_test.txt | 100 ------------------ paddlespeech/server/tests/tts/infer/run.sh | 28 ++--- .../server/tests/tts/infer/test_online_tts.py | 71 +++---------- 3 files changed, 26 insertions(+), 173 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/csmsc_test.txt diff --git a/paddlespeech/server/tests/tts/infer/csmsc_test.txt b/paddlespeech/server/tests/tts/infer/csmsc_test.txt deleted file mode 100644 index d8cf367c..00000000 --- a/paddlespeech/server/tests/tts/infer/csmsc_test.txt +++ /dev/null @@ -1,100 +0,0 @@ -009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 -009902 钱伟长想到上海来办学校是经过深思熟虑的。 -009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 -009904 李述德在离开之前,只说了一句柱驼杀父亲了。 -009905 这种车票和保险单捆绑出售属于重复性购买。 -009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 -009907 观大势,谋大局,出大策始终是该院的办院方针。 -009908 
他们骑着摩托回家,正好为农忙时的父母帮忙。 -009909 但是因为还没到退休年龄,只能掰着指头捱日子。 -009910 这几天雨水不断,人们恨不得待在家里不出门。 -009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 -009912 藤村此番发言可能是为了凸显野田的领导能力。 -009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 -009914 南海海域综合补给基地码头项目正在论证中。 -009915 也就是说今晚成都市民极有可能再次看到飘雪。 -009916 随着天气转热,各地的游泳场所开始人头攒动。 -009917 更让徐先生纳闷的是,房客的手机也打不通了。 -009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 -009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 -009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 -009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? -009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 -009923 会有很丰富的东西留下来,说都说不完。 -009924 这句话像从天而降,吓得四周一片寂静。 -009925 记者所在的是受害人家属所在的右区。 -009926 不管哈大爷去哪,它都一步不离地跟着。 -009927 大家抬头望去,一只老鼠正趴在吊顶上。 -009928 我决定过年就辞职,接手我爸的废品站! -009929 最终,中国男子乒乓球队获得此奖项。 -009930 防汛抗旱两手抓,抗旱相对抓的不够。 -009931 图们江下游地区开发开放的进展如何? -009932 这要求中国必须有一个坚强的政党领导。 -009933 再说,关于利益上的事俺俩都不好开口。 -009934 明代瓦剌,鞑靼入侵明境也是通过此地。 -009935 咪咪舔着孩子,把它身上的毛舔干净。 -009936 是否这次的国标修订被大企业绑架了? -009937 判决后,姚某妻子胡某不服,提起上诉。 -009938 由此可以看出邯钢的经济效益来自何处。 -009939 琳达说,是瑜伽改变了她和马儿的生活。 -009940 楼下的保安告诉记者,这里不租也不卖。 -009941 习近平说,中斯两国人民传统友谊深厚。 -009942 传闻越来越多,后来连老汉儿自己都怕了。 -009943 我怒吼一声冲上去,举起砖头砸了过去。 -009944 我现在还不会,这就回去问问发明我的人。 -009945 显然,洛阳性奴案不具备上述两个前提。 -009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 -009947 昨晚,华西都市报记者电话采访了尹琪。 -009948 涅拉季科未透露这些航空公司的名称。 -009949 从运行轨迹上来说,它也不可能是星星。 -009950 目前看,如果继续加息也存在两难问题。 -009951 曾宝仪在节目录制现场大爆观众糗事。 -009952 但任凭周某怎么叫,男子仍酣睡不醒。 -009953 老大爷说,小子,你挡我财路了,知道不? -009954 没料到,闯下大头佛的阿伟还不知悔改。 -009955 卡扎菲部落式统治已遭遇部落内讧。 -009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 -009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 -009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 -009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 -009960 因为,我们所发出的力量必会因难度加大而减弱。 -009961 发生事故的楼梯拐角处仍可看到血迹。 -009962 想过进公安,可能身高不够,老汉儿也不让我进去。 -009963 路上关卡很多,为了方便撤离,只好轻装前进。 -009964 原来比尔盖茨就是美国微软公司联合创始人呀。 -009965 之后他们一家三口将与双方父母往峇里岛旅游。 -009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 -009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 -009968 我会打开邮件,你可以从那里继续。 -009969 美方对近期东海局势表示关切。 -009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 -009971 打扫完你会很有成就感的,试一试,你就信了。 -009972 诺曼站在滑板车上,各就各位,准备出发啦! -009973 塔河的寒夜,气温降到了零下三十多摄氏度。 -009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 -009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 -009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 -009977 直至公元前一万一千年,它又再次出现。 -009978 尽量少玩电脑,少看电视,少打游戏。 -009979 从五到七,前后也就是六个月的时间。 -009980 一进咖啡店,他就遇见一张熟悉的脸。 -009981 好在众弟兄看到了把她追了回来。 -009982 有一个人说,哥们儿我们跑过它才能活。 -009983 捅了她以后,模糊记得她没咋动了。 -009984 从小到大,葛启义没有收到过压岁钱。 -009985 舞台下的你会对舞台上的你说什么? -009986 但考生普遍认为,试题的怪多过难。 -009987 我希望每个人都能够尊重我们的隐私。 -009988 漫天的红霞使劲给两人增添气氛。 -009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! -009990 该车将三人撞倒后,在大雾中逃窜。 -009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 -009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! -009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 -009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 -009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 -009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 -009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? -009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? 
-009999 因华国锋肖鸡,墓地设计根据其属相设计。 -010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index fdceec41..631daddd 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,14 +1,7 @@ -model_path=/home/users/liangyunming/.paddlespeech/models/ -#am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_nosil_baker_ckpt_0.4/ ## fastspeech2 -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_cnn -voc_model_dir=$model_path/hifigan_csmsc-zh/hifigan_csmsc_ckpt_0.1.1/ ## hifigan -#voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan - -if [[ $am_model_dir == *"fastspeech2_cnndecoder"* ]]; then - am_support_stream=True -else - am_support_stream=False -fi +model_path=~/.paddlespeech/models/ +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +testdata=../../../../t2s/exps/csmsc_test.txt # get am file for file in $(ls $am_model_dir) @@ -39,23 +32,24 @@ do done -#run -python test_online_tts.py --am fastspeech2_csmsc \ - --am_support_stream $am_support_stream \ +# run test +# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. +python test_online_tts.py --am fastspeech2-C_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ --phones_dict $am_model_dir/$phones_dict_file \ - --voc hifigan_csmsc \ + --voc mb_melgan_csmsc \ --voc_config $voc_model_dir/$voc_config_file \ --voc_ckpt $voc_model_dir/$voc_ckpt_file \ --voc_stat $voc_model_dir/$voc_stat_file \ --lang zh \ --device cpu \ - --text ./csmsc_test.txt \ + --text $testdata \ --output_dir ./output \ --log_file ./result.log \ - --am_streaming False \ + --am_streaming True \ --am_pad 12 \ --am_block 42 \ --voc_streaming True \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 17ac0ea7..8ccf724b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -71,8 +71,7 @@ def get_stream_am_inference(args, am_config): vocab_size = len(phn_id) print("vocab_size:", vocab_size) - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] + am_name = "fastspeech2" odim = am_config.n_mels am_class = dynamic_import(am_name, model_alias) @@ -100,7 +99,7 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, am_mu, am_std = get_stream_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: @@ -117,8 +116,6 @@ def init(args): def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): am_name = args.am[:args.am.rindex('_')] tone_ids = None - if am_name == 'speedyspeech': - get_tone_ids = True if args.lang == 'zh': input_ids = frontend.get_input_ids( @@ -142,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am_support_stream: + if args.am == 'fastspeech2-C_csmsc': am, 
am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -180,23 +177,7 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): else: am_inference, am_name, am_dataset, am_config = am_infer_info - # acoustic model - if am_name == 'fastspeech2': - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, spk_id) - else: - mel = am_inference(part_phone_ids) - elif am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - if am_dataset in {"aishell3", "vctk"}: - spk_id = paddle.to_tensor(args.spk_id) - mel = am_inference(part_phone_ids, part_tone_ids, spk_id) - else: - mel = am_inference(part_phone_ids, part_tone_ids) - elif am_name == 'tacotron2': - mel = am_inference(part_phone_ids) + mel = am_inference(part_phone_ids) return mel @@ -297,7 +278,8 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, global wav_streaming global voc_stream_st mel_streaming = None - flag = 1 #用来表示开启流式voc的线程 + #用来表示开启流式voc的线程 + flag = 1 am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) @@ -343,7 +325,7 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav -def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): +def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): global sample_rate logger.info( "Before the formal test, we test a few texts to make the inference speed more stable." @@ -363,7 +345,7 @@ def try_infer(args, logger, frontend, am_infer_info, voc_infer_info): merge_sentences = True get_tone_ids = False - for i in range(3): # 推理3次 + for i in range(5): # 推理5次 st = time.time() phone_ids, tone_ids = get_phone(args, frontend, sentence, merge_sentences, get_tone_ids) @@ -500,18 +482,10 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=[ - 'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc', - 'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk', - 'tacotron2_csmsc', 'tacotron2_ljspeech' - ], - help='Choose acoustic model type of tts task.') - parser.add_argument( - '--am_support_stream', - type=str2bool, - default=False, - help='if am model is fastspeech2_csmsc, specify whether it supports streaming' + choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2-C_csmsc supports streaming inference' ) + parser.add_argument( '--am_config', type=str, @@ -532,23 +506,12 @@ def parse_args(): "--phones_dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones_dict", type=str, default=None, help="tone vocabulary file.") - parser.add_argument( - "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( - '--spk_id', - type=int, - default=0, - help='spk id for multi speaker acoustic model') # vocoder parser.add_argument( '--voc', type=str, default='mb_melgan_csmsc', - choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', - 'wavernn_csmsc' - ], + choices=['mb_melgan_csmsc', 'hifigan_csmsc'], help='Choose vocoder type of tts task.') parser.add_argument( '--voc_config', @@ -612,12 +575,8 @@ def parse_args(): def main(): args = parse_args() paddle.set_device(args.device) - if args.am_support_stream: - assert (args.am == 'fastspeech2_csmsc') if args.am_streaming: - assert (args.am_support_stream and args.am == 'fastspeech2_csmsc') - if args.voc_streaming: - assert (args.voc == 'mb_melgan_csmsc' or args.voc == 'hifigan_csmsc') + assert (args.am == 'fastspeech2-C_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') @@ -639,8 +598,8 @@ def main(): # get information about model frontend, am_infer_info, voc_infer_info = init(args) logger.info( - "************************ try infer *********************************") - try_infer(args, logger, frontend, am_infer_info, voc_infer_info) + "************************ warm up *********************************") + warm_up(args, logger, frontend, am_infer_info, voc_infer_info) logger.info( "************************ normal test *******************************") evaluate(args, logger, frontend, am_infer_info, voc_infer_info) From 9d0224460bec81139fd7d69732dce0f7c7ec36fa Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 11 Apr 2022 15:54:44 +0800 Subject: [PATCH 06/18] code format, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 12 ++-- .../server/tests/tts/infer/test_online_tts.py | 67 ++++++++++--------- 2 files changed, 42 insertions(+), 37 deletions(-) diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh index 631daddd..3733c3fb 100644 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ b/paddlespeech/server/tests/tts/infer/run.sh @@ -1,6 +1,6 @@ model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ ## fastspeech2_c -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ ## mb_melgan +am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ +voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ testdata=../../../../t2s/exps/csmsc_test.txt # get am file @@ -33,9 +33,13 @@ done # run test -# am can choose fastspeech2_csmsc or fastspeech2-C_csmsc, where fastspeech2-C_csmsc supports streaming inference. +# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. # voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -python test_online_tts.py --am fastspeech2-C_csmsc \ +# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
+# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. +# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. + +python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ --am_config $am_model_dir/$am_config_file \ --am_ckpt $am_model_dir/$am_ckpt_file \ --am_stat $am_model_dir/$am_stat_file \ diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py index 8ccf724b..eb5fc80b 100644 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ b/paddlespeech/server/tests/tts/infer/test_online_tts.py @@ -34,8 +34,8 @@ from paddlespeech.t2s.utils import str2bool mel_streaming = None wav_streaming = None -stream_first_time = 0.0 -voc_stream_st = 0.0 +streaming_first_time = 0.0 +streaming_voc_st = 0.0 sample_rate = 0 @@ -65,7 +65,7 @@ def get_chunks(data, block_size, pad_size, step): return chunks -def get_stream_am_inference(args, am_config): +def get_streaming_am_inference(args, am_config): with open(args.phones_dict, "r") as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) @@ -99,8 +99,8 @@ def init(args): frontend = get_frontend(args) # acoustic model - if args.am == 'fastspeech2-C_csmsc': - am, am_mu, am_std = get_stream_am_inference(args, am_config) + if args.am == 'fastspeech2_cnndecoder_csmsc': + am, am_mu, am_std = get_streaming_am_inference(args, am_config) am_infer_info = [am, am_mu, am_std, am_config] else: am_inference, am_name, am_dataset = get_am_inference(args, am_config) @@ -139,7 +139,7 @@ def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): # 生成完整的mel def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): # 如果是支持流式的AM模型 - if args.am == 'fastspeech2-C_csmsc': + if args.am == 'fastspeech2_cnndecoder_csmsc': am, am_mu, am_std, am_config = am_infer_info orig_hs, h_masks = am.encoder_infer(part_phone_ids) if args.am_streaming: @@ -183,9 +183,9 @@ def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): @paddle.no_grad() -def stream_voc_infer(args, voc_infer_info, mel_len): +def streaming_voc_infer(args, voc_infer_info, mel_len): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming voc_inference, voc_config = voc_infer_info block = args.voc_block @@ -203,7 +203,7 @@ def stream_voc_infer(args, voc_infer_info, mel_len): while valid_end <= mel_len: sub_wav = voc_inference(mel_chunk) if flag == 1: - stream_first_time = time.time() + streaming_first_time = time.time() flag = 0 # get valid wav @@ -233,8 +233,8 @@ def stream_voc_infer(args, voc_infer_info, mel_len): @paddle.no_grad() # 非流式AM / 流式AM + 非流式Voc -def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, + part_tone_ids): mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) am_infer_time = time.time() voc_inference, voc_config = voc_infer_info @@ -248,10 +248,10 @@ def am_nostream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 非流式AM + 流式Voc -def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming mel 
= gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) @@ -260,8 +260,8 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, # voc streaming mel_streaming = mel mel_len = mel.shape[0] - stream_voc_infer(args, voc_infer_info, mel_len) - first_response_time = stream_first_time + streaming_voc_infer(args, voc_infer_info, mel_len) + first_response_time = streaming_first_time wav = wav_streaming final_response_time = time.time() voc_infer_time = final_response_time @@ -271,12 +271,12 @@ def nostream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, @paddle.no_grad() # 流式AM + 流式 Voc -def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): +def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, + part_phone_ids, part_tone_ids): global mel_streaming - global stream_first_time + global streaming_first_time global wav_streaming - global voc_stream_st + global streaming_voc_st mel_streaming = None #用来表示开启流式voc的线程 flag = 1 @@ -311,15 +311,16 @@ def stream_am_stream_voc(args, am_infer_info, voc_infer_info, part_phone_ids, if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: t = threading.Thread( - target=stream_voc_infer, args=(args, voc_infer_info, mel_len, )) + target=streaming_voc_infer, + args=(args, voc_infer_info, mel_len, )) t.start() - voc_stream_st = time.time() + streaming_voc_st = time.time() flag = 0 t.join() final_response_time = time.time() voc_infer_time = final_response_time - first_response_time = stream_first_time + first_response_time = streaming_first_time wav = wav_streaming return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav @@ -337,11 +338,11 @@ def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc merge_sentences = True get_tone_ids = False @@ -376,11 +377,11 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): # choose infer function if args.voc_streaming: if args.am_streaming: - infer_func = stream_am_stream_voc + infer_func = streaming_am_streaming_voc else: - infer_func = nostream_am_stream_voc + infer_func = nonstreaming_am_streaming_voc else: - infer_func = am_nostream_voc + infer_func = am_nonstreaming_voc final_up_duration = 0.0 sentence_count = 0 @@ -410,7 +411,7 @@ def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) am_time = am_infer_time - am_st if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - voc_stream_st + voc_time = voc_infer_time - streaming_voc_st else: voc_time = voc_infer_time - am_infer_time @@ -482,8 +483,8 @@ def parse_args(): '--am', type=str, default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2-C_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2-C_csmsc supports streaming inference' + choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], + help='Choose acoustic model type of tts task. 
where fastspeech2_cnndecoder_csmsc supports streaming inference' ) parser.add_argument( @@ -576,7 +577,7 @@ def main(): args = parse_args() paddle.set_device(args.device) if args.am_streaming: - assert (args.am == 'fastspeech2-C_csmsc') + assert (args.am == 'fastspeech2_cnndecoder_csmsc') logger = logging.getLogger() fhandler = logging.FileHandler(filename=args.log_file, mode='w') From 37d9c08da565f96d96dddcaf0f7aa91fa2ae6f8f Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 16 Apr 2022 04:04:39 +0000 Subject: [PATCH 07/18] fix utils for ngram and wfst --- .gitignore | 6 + speechx/examples/build_wfst/path.sh | 27 +++ speechx/examples/build_wfst/run.sh | 64 +++++++ speechx/examples/ds2_ol/aishell/README.md | 10 +- speechx/examples/ds2_ol/aishell/path.sh | 2 +- speechx/examples/ds2_ol/aishell/run.sh | 177 ++++++++++-------- .../examples/ngram/local/aishell_train_lms.sh | 57 ++++++ speechx/examples/ngram/path.sh | 20 ++ speechx/examples/ngram/run.sh | 61 ++++++ speechx/examples/ngram/utils | 1 + speechx/tools/install_srilm.sh | 97 ---------- tools/Makefile | 11 +- tools/extras/install_openfst.sh | 3 +- utils/espnet_json_to_manifest.py | 0 utils/generate_infer_yaml.py | 0 utils/link_wav.py | 0 utils/manifest_key_value.py | 33 +++- 17 files changed, 374 insertions(+), 195 deletions(-) create mode 100644 speechx/examples/build_wfst/path.sh create mode 100644 speechx/examples/build_wfst/run.sh create mode 100644 speechx/examples/ngram/local/aishell_train_lms.sh create mode 100644 speechx/examples/ngram/path.sh create mode 100644 speechx/examples/ngram/run.sh create mode 120000 speechx/examples/ngram/utils delete mode 100755 speechx/tools/install_srilm.sh mode change 100644 => 100755 utils/espnet_json_to_manifest.py mode change 100644 => 100755 utils/generate_infer_yaml.py mode change 100644 => 100755 utils/link_wav.py diff --git a/.gitignore b/.gitignore index 63947200..7328b329 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,12 @@ tools/Miniconda3-latest-Linux-x86_64.sh tools/activate_python.sh tools/miniconda.sh tools/CRF++-0.58/ +tools/liblbfgs-1.10/ +tools/srilm/ +tools/env.sh +tools/openfst-1.8.1/ +tools/libsndfile/ +tools/python-soundfile/ speechx/fc_patch/ diff --git a/speechx/examples/build_wfst/path.sh b/speechx/examples/build_wfst/path.sh new file mode 100644 index 00000000..e4008cd2 --- /dev/null +++ b/speechx/examples/build_wfst/path.sh @@ -0,0 +1,27 @@ +# This contains the locations of binarys build required for running the examples. + +SPEECHX_ROOT=$PWD/../../../ +MAIN_ROOT=$SPEECHX_ROOT/../ +SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } + +export LC_AL=C + +export PATH=$PATH:$TOOLS_BIN + +# srilm +export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs +export SRILM=${MAIN_ROOT}/tools/srilm +export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 + +# Kaldi +export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, can not using Kaldi!" +[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . 
$KALDI_ROOT/tools/config/common_path.sh diff --git a/speechx/examples/build_wfst/run.sh b/speechx/examples/build_wfst/run.sh new file mode 100644 index 00000000..bba14c59 --- /dev/null +++ b/speechx/examples/build_wfst/run.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -eo pipefail + +. path.sh + +stage=-1 +stop_stage=100 +corpus=aishell +lmtype=srilm + +lexicon= # aishell/resource_aishell/lexicon.txt +text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt + +source parse_options.sh + +if [ ! which ngram-count ]; then + pushd $MAIN_ROOT/tools + make srilm.done + popd +fi + +if [ ! which fstprint ]; then + pushd $MAIN_ROOT/tools + make kaldi.done + popd +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # 7.1 Prepare dict + unit_file=data/vocab.txt + mkdir -p data/local/dict + cp $unit_file data/local/dict/units.txt + utils/fst/prepare_dict.py \ + --unit_file $unit_file \ + --in_lexicon ${lexicon} \ + --out_lexicon data/local/dict/lexicon.txt +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # 7.2 Train lm + lm=data/local/lm + mkdir -p data/train + mkdir -p $lm + utils/manifest_key_value.py \ + --manifest_path data/manifest.train \ + --output_path data/train + utils/filter_scp.pl data/train/text \ + $text > $lm/text + if [ $lmtype == 'srilm' ];then + local/aishell_train_lms.sh + else + utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa + fi +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # 7.3 Build decoding TLG + utils/fst/compile_lexicon_token_fst.sh \ + data/local/dict data/local/tmp data/local/lang + utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; +fi + +echo "Aishell build TLG done." +exit 0 diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md index eec67c3b..1ee73a33 100644 --- a/speechx/examples/ds2_ol/aishell/README.md +++ b/speechx/examples/ds2_ol/aishell/README.md @@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ## CTC Prefix Beam Search w LM +LM: zh_giga.no_cna_cmn.prune01244.klm ``` - +Overall -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327 +Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327 +Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ``` ## CTC WFST +LM: aishell train ``` - +Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819 +Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818 +Other -> 0.00 % N=0 C=0 S=0 D=0 I=1 ``` \ No newline at end of file diff --git a/speechx/examples/ds2_ol/aishell/path.sh b/speechx/examples/ds2_ol/aishell/path.sh index 8e26e6e7..0a300f36 100644 --- a/speechx/examples/ds2_ol/aishell/path.sh +++ b/speechx/examples/ds2_ol/aishell/path.sh @@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin export LC_AL=C SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat -export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN \ No newline at end of file diff --git a/speechx/examples/ds2_ol/aishell/run.sh b/speechx/examples/ds2_ol/aishell/run.sh index 3a1c19ee..6a59ca9b 100755 --- a/speechx/examples/ds2_ol/aishell/run.sh +++ b/speechx/examples/ds2_ol/aishell/run.sh @@ -5,7 +5,10 @@ set -e . path.sh nj=40 +stage=0 +stop_stage=100 +. utils/parse_options.sh # 1. compile if [ ! -d ${SPEECHX_EXAMPLES} ]; then @@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/ mkdir -p exp exp=$PWD/exp -aishell_wav_scp=aishell_test.scp -if [ ! 
-d $data/test ]; then - pushd $data - wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip - unzip aishell_test.zip - popd - - realpath $data/test/*/*.wav > $data/wavlist - awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id - paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp -fi - - -if [ ! -d $ckpt_dir ]; then - mkdir -p $ckpt_dir - wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz - tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir -fi - -lm=$data/zh_giga.no_cna_cmn.prune01244.klm -if [ ! -f $lm ]; then - pushd $data - wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm - popd +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then + aishell_wav_scp=aishell_test.scp + if [ ! -d $data/test ]; then + pushd $data + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip + unzip aishell_test.zip + popd + + realpath $data/test/*/*.wav > $data/wavlist + awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id + paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp + fi + + + if [ ! -d $ckpt_dir ]; then + mkdir -p $ckpt_dir + wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz + tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir + fi + + lm=$data/zh_giga.no_cna_cmn.prune01244.klm + if [ ! -f $lm ]; then + pushd $data + wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm + popd + fi fi # 3. make feature +text=$data/test/text label_file=./aishell_result wer=./aishell_wer export GLOG_logtostderr=1 -# 3. gen linear feat -cmvn=$PWD/cmvn.ark -cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn +if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then + # 3. gen linear feat + cmvn=$data/cmvn.ark + cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn -./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj + ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj -utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ -linear-spectrogram-wo-db-norm-ol \ - --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ - --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ - --cmvn_file=$cmvn \ - --streaming_chunk=0.36 - -text=$data/test/text + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ + linear-spectrogram-wo-db-norm-ol \ + --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ + --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ + --cmvn_file=$cmvn \ + --streaming_chunk=0.36 +fi -# 4. recognizer -utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ - ctc-prefix-beam-search-decoder-ol \ - --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdiparams \ - --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --dict_file=$vocb_dir/vocab.txt \ - --result_wspecifier=ark,t:$data/split${nj}/JOB/result - -cat $data/split${nj}/*/result > ${label_file} -utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer} - -# 4. 
decode with lm -utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ - ctc-prefix-beam-search-decoder-ol \ - --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdiparams \ - --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --dict_file=$vocb_dir/vocab.txt \ - --lm_path=$lm \ - --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm - - -cat $data/split${nj}/*/result_lm > ${label_file}_lm -utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm - - -graph_dir=./aishell_graph -if [ ! -d $ ]; then - wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip - unzip -d aishell_graph.zip +if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then + # recognizer + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ + ctc-prefix-beam-search-decoder-ol \ + --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdiparams \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --dict_file=$vocb_dir/vocab.txt \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result + + cat $data/split${nj}/*/result > $exp/${label_file} + utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer} fi +if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then + # decode with lm + utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \ + ctc-prefix-beam-search-decoder-ol \ + --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdiparams \ + --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ + --dict_file=$vocb_dir/vocab.txt \ + --lm_path=$lm \ + --result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm + + cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm + utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm +fi -# 5. test TLG decoder -utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \ - wfst-decoder-ol \ - --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ - --model_path=$model_dir/avg_1.jit.pdmodel \ - --param_path=$model_dir/avg_1.jit.pdiparams \ - --word_symbol_table=$graph_dir/words.txt \ - --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ - --graph_path=$graph_dir/TLG.fst --max_active=7500 \ - --acoustic_scale=1.2 \ - --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg +wfst=$data/wfst/ +mkdir -p $wfst +if [ ! 
-f $wfst/aishell_graph.zip ]; then
+    pushd $wfst
+    wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
+    unzip aishell_graph.zip
+    popd
+fi
-cat $data/split${nj}/*/result_tlg > ${label_file}_tlg
-utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg
\ No newline at end of file
+graph_dir=$wfst/aishell_graph
+if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
+    # TLG decoder
+    utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
+    wfst-decoder-ol \
+        --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
+        --model_path=$model_dir/avg_1.jit.pdmodel \
+        --param_path=$model_dir/avg_1.jit.pdiparams \
+        --word_symbol_table=$graph_dir/words.txt \
+        --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
+        --graph_path=$graph_dir/TLG.fst --max_active=7500 \
+        --acoustic_scale=1.2 \
+        --result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
+
+    cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
+    utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
+fi
\ No newline at end of file
diff --git a/speechx/examples/ngram/local/aishell_train_lms.sh b/speechx/examples/ngram/local/aishell_train_lms.sh
new file mode 100644
index 00000000..d9f87aca
--- /dev/null
+++ b/speechx/examples/ngram/local/aishell_train_lms.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# To be run from one directory above this script.
+. ./path.sh
+
+text=data/local/lm/text
+lexicon=data/local/dict/lexicon.txt
+
+for f in "$text" "$lexicon"; do
+  [ ! -f $f ] && echo "$0: No such file $f" && exit 1;
+done
+
+# Check SRILM tools
+if ! which ngram-count > /dev/null; then
+  echo "srilm tools are not found, please download it and install it from: "
+  echo "http://www.speech.sri.com/projects/srilm/download.html"
+  echo "Then add the tools to your PATH"
+  exit 1
+fi
+
+# This script takes no arguments. It assumes you have already run
+# aishell_data_prep.sh.
+# It takes as input the files
+# data/local/lm/text
+# data/local/dict/lexicon.txt
+dir=data/local/lm
+mkdir -p $dir
+
+cleantext=$dir/text.no_oov
+
+cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
+  {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
+  > $cleantext || exit 1;
+
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
+  sort -nr > $dir/word.counts || exit 1;
+
+# Get counts from acoustic training transcripts, and add one-count
+# for each word in the lexicon (but not silence, we don't want it
+# in the LM-- we'll add it optionally later).
+cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
+  cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
+  sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
+
+cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
+
+heldout_sent=10000 # Don't change this if you want result to be comparable with
+    # kaldi_lm results
+mkdir -p $dir
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | head -$heldout_sent > $dir/heldout
+cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | tail -n +$heldout_sent > $dir/train
+
+ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
+  -map-unk "<unk>" -kndiscount -interpolate -lm $dir/lm.arpa
+ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
diff --git a/speechx/examples/ngram/path.sh b/speechx/examples/ngram/path.sh
new file mode 100644
index 00000000..f926ccd2
--- /dev/null
+++ b/speechx/examples/ngram/path.sh
@@ -0,0 +1,20 @@
+# This contains the locations of binarys build required for running the examples.
+
+SPEECHX_ROOT=$PWD/../../../
+MAIN_ROOT=$SPEECHX_ROOT/../
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+
+export LC_AL=C
+
+export PATH=$PATH:$TOOLS_BIN
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
\ No newline at end of file
diff --git a/speechx/examples/ngram/run.sh b/speechx/examples/ngram/run.sh
new file mode 100644
index 00000000..462a8955
--- /dev/null
+++ b/speechx/examples/ngram/run.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+set -eo pipefail
+
+. path.sh
+
+stage=-1
+stop_stage=100
+corpus=aishell
+
+unit=data/vocab.txt # vocab
+lexicon= # aishell/resource_aishell/lexicon.txt
+text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+
+. parse_options.sh
+
+data=$PWD/data
+mkdir -p $data
+
+if [ ! -f $unit ]; then
+  echo "$0: No such file $unit"
+  exit 1;
+fi
+
+if [ ! which ngram-count ]; then
+  pushd $MAIN_ROOT/tools
+  make srilm.done
+  popd
+fi
+
+if [ ! which fstaddselfloops ]; then
+  pushd $MAIN_ROOT/tools
+  make kaldi.done
+  popd
+fi
+
+mkdir -p data/local/dict
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  # 7.1 Prepare dict
+  cp $unit data/local/dict/units.txt
+  utils/fst/prepare_dict.py \
+    --unit_file $unit \
+    --in_lexicon ${lexicon} \
+    --out_lexicon data/local/dict/lexicon.txt
+fi
+
+lm=data/local/lm
+mkdir -p data/train
+mkdir -p $lm
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  # 7.2 Train lm
+  utils/manifest_key_value.py \
+    --manifest_path data/manifest.train \
+    --output_path data/train
+  utils/filter_scp.pl data/train/text \
+    $text > $lm/text
+
+  local/aishell_train_lms.sh
+fi
+
+echo "build LM done."
+exit 0 diff --git a/speechx/examples/ngram/utils b/speechx/examples/ngram/utils new file mode 120000 index 00000000..256f914a --- /dev/null +++ b/speechx/examples/ngram/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file diff --git a/speechx/tools/install_srilm.sh b/speechx/tools/install_srilm.sh deleted file mode 100755 index 813109db..00000000 --- a/speechx/tools/install_srilm.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env bash - -current_path=`pwd` -current_dir=`basename "$current_path"` - -if [ "tools" != "$current_dir" ]; then - echo "You should run this script in tools/ directory!!" - exit 1 -fi - -if [ ! -d liblbfgs-1.10 ]; then - echo Installing libLBFGS library to support MaxEnt LMs - bash extras/install_liblbfgs.sh || exit 1 -fi - -! command -v gawk > /dev/null && \ - echo "GNU awk is not installed so SRILM will probably not work correctly: refusing to install" && exit 1; - -if [ $# -ne 3 ]; then - echo "SRILM download requires some information about you" - echo - echo "Usage: $0 " - exit 1 -fi - -srilm_url="http://www.speech.sri.com/projects/srilm/srilm_download.php" -post_data="WWW_file=srilm-1.7.3.tar.gz&WWW_name=$1&WWW_org=$2&WWW_email=$3" - -if ! wget --post-data "$post_data" -O ./srilm.tar.gz "$srilm_url"; then - echo 'There was a problem downloading the file.' - echo 'Check you internet connection and try again.' - exit 1 -fi - -mkdir -p srilm -cd srilm - - -if [ -f ../srilm.tgz ]; then - tar -xvzf ../srilm.tgz # Old SRILM format -elif [ -f ../srilm.tar.gz ]; then - tar -xvzf ../srilm.tar.gz # Changed format type from tgz to tar.gz -fi - -major=`gawk -F. '{ print $1 }' RELEASE` -minor=`gawk -F. '{ print $2 }' RELEASE` -micro=`gawk -F. '{ print $3 }' RELEASE` - -if [ $major -le 1 ] && [ $minor -le 7 ] && [ $micro -le 1 ]; then - echo "Detected version 1.7.1 or earlier. Applying patch." - patch -p0 < ../extras/srilm.patch -fi - -# set the SRILM variable in the top-level Makefile to this directory. -cp Makefile tmpf - -cat tmpf | gawk -v pwd=`pwd` '/SRILM =/{printf("SRILM = %s\n", pwd); next;} {print;}' \ - > Makefile || exit 1 -rm tmpf - -mtype=`sbin/machine-type` - -echo HAVE_LIBLBFGS=1 >> common/Makefile.machine.$mtype -grep ADDITIONAL_INCLUDES common/Makefile.machine.$mtype | \ - sed 's|$| -I$(SRILM)/../liblbfgs-1.10/include|' \ - >> common/Makefile.machine.$mtype - -grep ADDITIONAL_LDFLAGS common/Makefile.machine.$mtype | \ - sed 's|$| -L$(SRILM)/../liblbfgs-1.10/lib/ -Wl,-rpath -Wl,$(SRILM)/../liblbfgs-1.10/lib/|' \ - >> common/Makefile.machine.$mtype - -make || exit - -cd .. -( - [ ! -z "${SRILM}" ] && \ - echo >&2 "SRILM variable is aleady defined. Undefining..." && \ - unset SRILM - - [ -f ./env.sh ] && . ./env.sh - - [ ! 
-z "${SRILM}" ] && \ - echo >&2 "SRILM config is already in env.sh" && exit - - wd=`pwd` - wd=`readlink -f $wd || pwd` - - echo "export SRILM=$wd/srilm" - dirs="\${PATH}" - for directory in $(cd srilm && find bin -type d ) ; do - dirs="$dirs:\${SRILM}/$directory" - done - echo "export PATH=$dirs" -) >> env.sh - -echo >&2 "Installation of SRILM finished successfully" -echo >&2 "Please source the tools/env.sh in your path.sh to enable it" diff --git a/tools/Makefile b/tools/Makefile index 285f85c8..a5a4485d 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -25,7 +25,7 @@ clean: apt.done: apt update -y - apt install -y bc flac jq vim tig tree pkg-config libsndfile1 libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev + apt install -y bc flac jq vim tig tree sox pkg-config libsndfile1 libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev echo "check_certificate = off" >> ~/.wgetrc touch apt.done @@ -50,7 +50,7 @@ openblas.done: bash extras/install_openblas.sh touch openblas.done -kaldi.done: openblas.done +kaldi.done: apt.done openblas.done bash extras/install_kaldi.sh touch kaldi.done @@ -58,6 +58,11 @@ sctk.done: ./extras/install_sclite.sh touch sctk.done +srilm.done: + ./extras/install_liblbfgs.sh + extras/install_srilm.sh + touch srilm.done + ###################### dev: python conda_packages.done sctk.done @@ -96,4 +101,4 @@ conda_packages.done: bc.done cmake.done flac.done ffmpeg.done sox.done sndfile.d else conda_packages.done: endif - touch conda_packages.done \ No newline at end of file + touch conda_packages.done diff --git a/tools/extras/install_openfst.sh b/tools/extras/install_openfst.sh index 54ddef6a..5e97bc81 100755 --- a/tools/extras/install_openfst.sh +++ b/tools/extras/install_openfst.sh @@ -7,8 +7,9 @@ set -x # openfst openfst=openfst-1.8.1 shared=true +WGET="wget -c --no-check-certificate" -test -e ${openfst}.tar.gz || wget http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz +test -e ${openfst}.tar.gz || $WGET http://www.openfst.org/twiki/pub/FST/FstDownload/${openfst}.tar.gz test -d ${openfst} || tar -xvf ${openfst}.tar.gz && chown -R root:root ${openfst} diff --git a/utils/espnet_json_to_manifest.py b/utils/espnet_json_to_manifest.py old mode 100644 new mode 100755 diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py old mode 100644 new mode 100755 diff --git a/utils/link_wav.py b/utils/link_wav.py old mode 100644 new mode 100755 diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index fb3d3aaa..0ab3ae08 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -26,23 +26,38 @@ def main(args): with wav_scp.open('w') as fwav, dur_scp.open('w') as fdur, text_scp.open( 'w') as ftxt: for line_json in manifest_jsons: + # utt:str + # utt2spk:str + # input: [{name:str, shape:[dur_in_sec, feat_dim], feat:str, filetype:str}, ] + # output: [{name:str, shape:[tokenlen, vocab_dim], text:str, token:str, tokenid:str}, ] utt = line_json['utt'] - feat = line_json['feat'] + utt2spk = line_json['utt2spk'] + + # input + assert(len(line_json['input']) == 1), "only support one input now" + input_json = line_json['input'][0] + feat = input_json['feat'] + feat_shape = input_json['shape'] + file_type = input_json['filetype'] + file_ext = Path(feat).suffix # .wav - text = line_json['text'] - feat_shape = line_json['feat_shape'] dur = feat_shape[0] feat_dim = feat_shape[1] - if 'token' in line_json: - tokens = line_json['token'] - tokenids = line_json['token_id'] - token_shape = 
line_json['token_shape']
-                token_len = token_shape[0]
-                vocab_dim = token_shape[1]
 
             if file_ext == '.wav':
                 fwav.write(f"{utt} {feat}\n")
                 fdur.write(f"{utt} {dur}\n")
+
+            # output
+            assert(len(line_json['output']) == 1), "only support one output now"
+            output_json = line_json['output'][0]
+            text = output_json['text']
+            if 'token' in output_json:
+                tokens = output_json['token']
+                tokenids = output_json['tokenid']
+                token_shape = output_json['shape']
+                token_len = token_shape[0]
+                vocab_dim = token_shape[1]
             ftxt.write(f"{utt} {text}\n")
             count += 1

From eb52896c4a1b7db12072a11d481a7a0260a2492f Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 16 Apr 2022 06:48:28 +0000
Subject: [PATCH 08/18] zh ngram build

---
 speechx/examples/README.md                    |  18 +++-
 speechx/examples/ngram/README.md              |   1 -
 speechx/examples/ngram/en/README.md           |   0
 speechx/examples/ngram/path.sh                |  20 ----
 speechx/examples/ngram/run.sh                 |  61 -----------
 speechx/examples/ngram/utils                  |   1 -
 speechx/examples/ngram/zh/README.md           | 101 ++++++++++++++++++
 .../ngram/{ => zh}/local/aishell_train_lms.sh |   7 ++
 speechx/examples/ngram/zh/path.sh             |  12 +++
 speechx/examples/ngram/zh/run.sh              |  62 +++++++++++
 speechx/examples/ngram/zh/utils               |   1 +
 speechx/examples/text_lm/.gitignore           |   1 +
 speechx/examples/text_lm/path.sh              |   0
 speechx/examples/text_lm/run.sh               |   0
 speechx/examples/wfst/README.md               |  18 ++++
 speechx/examples/{build_wfst => wfst}/path.sh |  12 +--
 speechx/examples/{build_wfst => wfst}/run.sh  |   6 --
 17 files changed, 217 insertions(+), 104 deletions(-)
 delete mode 100644 speechx/examples/ngram/README.md
 create mode 100644 speechx/examples/ngram/en/README.md
 delete mode 100644 speechx/examples/ngram/path.sh
 delete mode 100644 speechx/examples/ngram/run.sh
 delete mode 120000 speechx/examples/ngram/utils
 create mode 100644 speechx/examples/ngram/zh/README.md
 rename speechx/examples/ngram/{ => zh}/local/aishell_train_lms.sh (90%)
 mode change 100644 => 100755
 create mode 100644 speechx/examples/ngram/zh/path.sh
 create mode 100755 speechx/examples/ngram/zh/run.sh
 create mode 120000 speechx/examples/ngram/zh/utils
 create mode 100644 speechx/examples/text_lm/.gitignore
 create mode 100644 speechx/examples/text_lm/path.sh
 create mode 100644 speechx/examples/text_lm/run.sh
 create mode 100644 speechx/examples/wfst/README.md
 rename speechx/examples/{build_wfst => wfst}/path.sh (68%)
 rename speechx/examples/{build_wfst => wfst}/run.sh (93%)

diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index 35174a0d..c3de0d3a 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -1,12 +1,10 @@
 # Examples for SpeechX
 
-* dev - for speechx developer, using for test.
-* ngram - using to build NGram ARPA lm.
 * ds2_ol - ds2 streaming test under `aishell-1` test dataset.
-  The entrypoint is `ds2_ol/aishell/run.sh`
+  The entrypoint is `ds2_ol/aishell/run.sh`
 
-## How to run
+## How to run
 
 `run.sh` is the entry point.
 
@@ -17,9 +15,19 @@ pushd ds2_ol/aishell
 bash run.sh
 ```
 
-## Display Model with [Netron](https://github.com/lutzroeder/netron)
+## Display Model with [Netron](https://github.com/lutzroeder/netron)
 
 ```
 pip install netron
 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
 ```
+
+## Build WFST
+
+* text_lm - preprocess the text used to build the LM.
+* ngram - build the n-gram ARPA LM.
+* wfst - build the TLG WFST decoding graph.
+
+## For Developers
+
+* dev - scratch space for SpeechX developers, used for testing.
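Note: the three "Build WFST" stages listed above are driven by the run.sh scripts this patch introduces. A minimal sketch of how they chain together, assuming the default data/ layout used by those scripts (data/vocab.txt as the unit file, data/lexicon.txt as the raw lexicon, data/lang_test as the output graph directory); the actual entry points remain ngram/zh/run.sh and wfst/run.sh:

```
# sketch: text_lm -> ngram -> wfst
# 1) map each lexicon word onto model units (chars / spm pieces)
utils/fst/prepare_dict.py \
    --unit_file data/vocab.txt \
    --in_lexicon data/lexicon.txt \
    --out_lexicon data/local/dict/lexicon.txt

# 2) train a 3-gram ARPA LM with SRILM on the normalized transcripts
local/aishell_train_lms.sh   # writes data/local/lm/lm.arpa

# 3) compile the lexicon (L) and token (T) FSTs, then compose with G into TLG.fst
utils/fst/compile_lexicon_token_fst.sh \
    data/local/dict data/local/tmp data/local/lang
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test
```

The same sequence appears in speechx/examples/wfst/run.sh; the sketch only spells out the data flow between the three example directories.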
diff --git a/speechx/examples/ngram/README.md b/speechx/examples/ngram/README.md deleted file mode 100644 index b120715f..00000000 --- a/speechx/examples/ngram/README.md +++ /dev/null @@ -1 +0,0 @@ -# NGram Train diff --git a/speechx/examples/ngram/en/README.md b/speechx/examples/ngram/en/README.md new file mode 100644 index 00000000..e69de29b diff --git a/speechx/examples/ngram/path.sh b/speechx/examples/ngram/path.sh deleted file mode 100644 index f926ccd2..00000000 --- a/speechx/examples/ngram/path.sh +++ /dev/null @@ -1,20 +0,0 @@ -# This contains the locations of binarys build required for running the examples. - -SPEECHX_ROOT=$PWD/../../../ -MAIN_ROOT=$SPEECHX_ROOT/../ -SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples - -SPEECHX_TOOLS=$SPEECHX_ROOT/tools -TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin - -[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } - -export LC_AL=C - -export PATH=$PATH:$TOOLS_BIN - -# srilm -export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10 -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs -export SRILM=${MAIN_ROOT}/tools/srilm -export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64 \ No newline at end of file diff --git a/speechx/examples/ngram/run.sh b/speechx/examples/ngram/run.sh deleted file mode 100644 index 462a8955..00000000 --- a/speechx/examples/ngram/run.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/bash -set -eo pipefail - -. path.sh - -stage=-1 -stop_stage=100 -corpus=aishell - -unit=data/vocab.txt # vocab -lexicon= # aishell/resource_aishell/lexicon.txt -text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt - -. parse_options.sh - -data=$PWD/data -mkdir -p $data - -if [ ! -f $unit ]; then - echo "$0: No such file $unit" - exit 1; -fi - -if [ ! which ngram-count ]; then - pushd $MAIN_ROOT/tools - make srilm.done - popd -fi - -if [ ! which fstaddselfloops ]; then - pushd $MAIN_ROOT/tools - make kaldi.done - popd -fi - -mkdir -p data/local/dict -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # 7.1 Prepare dict - cp $unit data/local/dict/units.txt - utils/fst/prepare_dict.py \ - --unit_file $unit \ - --in_lexicon ${lexicon} \ - --out_lexicon data/local/dict/lexicon.txt -fi - -lm=data/local/lm -mkdir -p data/train -mkdir -p $lm -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # 7.2 Train lm - utils/manifest_key_value.py \ - --manifest_path data/manifest.train \ - --output_path data/train - utils/filter_scp.pl data/train/text \ - $text > $lm/text - - local/aishell_train_lms.sh -fi - -echo "build LM done." 
-exit 0
diff --git a/speechx/examples/ngram/utils b/speechx/examples/ngram/utils
deleted file mode 120000
index 256f914a..00000000
--- a/speechx/examples/ngram/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../utils/
\ No newline at end of file
diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md
new file mode 100644
index 00000000..be2062db
--- /dev/null
+++ b/speechx/examples/ngram/zh/README.md
@@ -0,0 +1,101 @@
+# N-gram LM Training for Mandarin
+
+Quick run:
+```
+bash run.sh --stage -1
+```
+
+## Input
+
+Input files:
+```
+data/
+├── lexicon.txt
+├── text
+└── vocab.txt
+```
+
+```
+==> data/text <==
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+
+==> data/lexicon.txt <==
+SIL sil
+<SPOKEN_NOISE> sil
+啊 aa a1
+啊 aa a2
+啊 aa a4
+啊 aa a5
+啊啊啊 aa a2 aa a2 aa a2
+啊啊啊 aa a5 aa a5 aa a5
+坐地 z uo4 d i4
+坐实 z uo4 sh ix2
+坐视 z uo4 sh ix4
+坐稳 z uo4 uu un3
+坐拥 z uo4 ii iong1
+坐诊 z uo4 zh en3
+坐庄 z uo4 zh uang1
+坐姿 z uo4 z iy1
+
+==> data/vocab.txt <==
+<blank>
+<unk>
+A
+B
+C
+D
+E
+龙
+龚
+龛
+<sos/eos>
+```
+
+## Output
+
+```
+data/
+├── local
+│   ├── dict
+│   │   ├── lexicon.txt
+│   │   └── units.txt
+│   └── lm
+│       ├── heldout
+│       ├── lm.arpa
+│       ├── text
+│       ├── text.no_oov
+│       ├── train
+│       ├── unigram.counts
+│       ├── word.counts
+│       └── wordlist
+```
+
+```
+/workspace/srilm/bin/i686-m64/ngram-count
+Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
+Ignoring words 矽, which contains oov unit
+Ignoring words 傩, which contains oov unit
+Ignoring words 堀, which contains oov unit
+Ignoring words 莼, which contains oov unit
+Ignoring words 菰, which contains oov unit
+Ignoring words 摭, which contains oov unit
+Ignoring words 帙, which contains oov unit
+Ignoring words 迨, which contains oov unit
+Ignoring words 孥, which contains oov unit
+Ignoring words 瑗, which contains oov unit
+...
+...
+...
+file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
+0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
+build LM done.
+```
\ No newline at end of file
diff --git a/speechx/examples/ngram/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
old mode 100644
new mode 100755
similarity index 90%
rename from speechx/examples/ngram/local/aishell_train_lms.sh
rename to speechx/examples/ngram/zh/local/aishell_train_lms.sh
index d9f87aca..e3cee438
--- a/speechx/examples/ngram/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -28,10 +28,14 @@ mkdir -p $dir
 
 cleantext=$dir/text.no_oov
 
+# oov to <SPOKEN_NOISE>
+# line: utt word0 ... wordn -> line: word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
   {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   > $cleantext || exit 1;
 
+# compute word counts
+# line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;
 
@@ -42,10 +46,13 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
   cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
 
+# word with <s> </s>
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 
+# hold out to compute ppl
 heldout_sent=10000 # Don't change this if you want result to be comparable with
     # kaldi_lm results
+
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | head -$heldout_sent > $dir/heldout
diff --git a/speechx/examples/ngram/zh/path.sh b/speechx/examples/ngram/zh/path.sh
new file mode 100644
index 00000000..a3fb3d75
--- /dev/null
+++ b/speechx/examples/ngram/zh/path.sh
@@ -0,0 +1,12 @@
+# This contains the locations of binarys build required for running the examples.
+
+MAIN_ROOT=`realpath $PWD/../../../../`
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
+
+export LC_AL=C
+
+# srilm
+export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
+export SRILM=${MAIN_ROOT}/tools/srilm
+export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
new file mode 100755
index 00000000..eda422b3
--- /dev/null
+++ b/speechx/examples/ngram/zh/run.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -eo pipefail
+
+. path.sh
+
+stage=0
+stop_stage=100
+corpus=aishell
+
+unit=data/vocab.txt # line: char/spm_piece, vocab file
+lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
+text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+
+. utils/parse_options.sh
+
+data=$PWD/data
+mkdir -p $data
+
+if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
+  if [ ! -f $data/speech.ngram.zh.tar.gz ];then
+    pushd $data
+    wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
+    tar xvzf speech.ngram.zh.tar.gz
+    popd
+  fi
+fi
+
+if [ ! -f $unit ]; then
+  echo "$0: No such file $unit"
+  exit 1;
+fi
+
+if ! which ngram-count; then
+  pushd $MAIN_ROOT/tools
+  make srilm.done
+  popd
+fi
+
+mkdir -p data/local/dict
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  # 7.1 Prepare dict
+  # line: char/spm_pieces
+  cp $unit data/local/dict/units.txt
+
+  # line: word ph0 ... phn -> line: word char0 ... charn
+  utils/fst/prepare_dict.py \
+    --unit_file $unit \
+    --in_lexicon ${lexicon} \
+    --out_lexicon data/local/dict/lexicon.txt
+fi
+
+lm=data/local/lm
+mkdir -p $lm
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  # 7.2 Train lm
+  cp $text $lm/text
+  local/aishell_train_lms.sh
+fi
+
+echo "build LM done."
+exit 0
diff --git a/speechx/examples/ngram/zh/utils b/speechx/examples/ngram/zh/utils
new file mode 120000
index 00000000..c2519a9d
--- /dev/null
+++ b/speechx/examples/ngram/zh/utils
@@ -0,0 +1 @@
+../../../../utils/
\ No newline at end of file
diff --git a/speechx/examples/text_lm/.gitignore b/speechx/examples/text_lm/.gitignore
new file mode 100644
index 00000000..1269488f
--- /dev/null
+++ b/speechx/examples/text_lm/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/speechx/examples/text_lm/path.sh b/speechx/examples/text_lm/path.sh
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/examples/text_lm/run.sh b/speechx/examples/text_lm/run.sh
new file mode 100644
index 00000000..e69de29b
diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md
new file mode 100644
index 00000000..dd9b926f
--- /dev/null
+++ b/speechx/examples/wfst/README.md
@@ -0,0 +1,18 @@
+```
+fstaddselfloops 'echo 4234 |' 'echo 123660 |'
+Lexicon and Token FSTs compiling succeeded
+arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
+LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
+Checking how stochastic G is (the first of these numbers should be small):
+fstisstochastic data/lang_test/G.fst
+0 -1.14386
+fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
+fstminimizeencoded
+fstdeterminizestar --use-log=true
+fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
+Composing decoding graph TLG.fst succeeded
+Aishell build TLG done.
+```
\ No newline at end of file
diff --git a/speechx/examples/build_wfst/path.sh b/speechx/examples/wfst/path.sh
similarity index 68%
rename from speechx/examples/build_wfst/path.sh
rename to speechx/examples/wfst/path.sh
index e4008cd2..877f2399
--- a/speechx/examples/build_wfst/path.sh
+++ b/speechx/examples/wfst/path.sh
@@ -1,18 +1,10 @@
 # This contains the locations of binarys build required for running the examples.
 
-SPEECHX_ROOT=$PWD/../../../
-MAIN_ROOT=$SPEECHX_ROOT/../
-SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
-
-SPEECHX_TOOLS=$SPEECHX_ROOT/tools
-TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
-
-[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; }
+MAIN_ROOT=`realpath $PWD/../../../../`
+SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
 
 export LC_AL=C
 
-export PATH=$PATH:$TOOLS_BIN
-
 # srilm
 export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
 export SRILM=${MAIN_ROOT}/tools/srilm
 export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
diff --git a/speechx/examples/build_wfst/run.sh b/speechx/examples/wfst/run.sh
similarity index 93%
rename from speechx/examples/build_wfst/run.sh
rename to speechx/examples/wfst/run.sh
index bba14c59..b53e1a5b
--- a/speechx/examples/build_wfst/run.sh
+++ b/speechx/examples/wfst/run.sh
@@ -13,12 +13,6 @@ text= # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
 
 source parse_options.sh
 
-if [ ! which ngram-count ]; then
-    pushd $MAIN_ROOT/tools
-    make srilm.done
-    popd
-fi
-
 if [ !
which fstprint ]; then
     pushd $MAIN_ROOT/tools
     make kaldi.done

From a054d1c4524d4aa8cfdba166ae0b2b6cc7eb9562 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 16 Apr 2022 13:24:25 +0000
Subject: [PATCH 09/18] text process for lm

---
 .../other/ngram_lm/s0/local/build_zh_lm.sh |   2 +-
 setup.py                                   |   2 +-
 speechx/examples/ds2_ol/aishell/README.md  |   2 +-
 speechx/examples/ngram/zh/README.md        |   2 +-
 speechx/examples/text_lm/README.md         |  15 +
 speechx/examples/text_lm/path.sh           |   4 +
 speechx/examples/text_lm/run.sh            |  24 +
 speechx/examples/text_lm/utils             |   1 +
 speechx/examples/wfst/README.md            |   2 +-
 utils/compute-wer.py                       | 965 +++++++++---------
 utils/manifest_key_value.py                |   5 +-
 utils/zh_tn.py                             | 686 +++++++++++--
 12 files changed, 1175 insertions(+), 535 deletions(-)
 create mode 100644 speechx/examples/text_lm/README.md
 mode change 100644 => 100755 speechx/examples/text_lm/run.sh
 create mode 120000 speechx/examples/text_lm/utils

diff --git a/examples/other/ngram_lm/s0/local/build_zh_lm.sh b/examples/other/ngram_lm/s0/local/build_zh_lm.sh
index 73eb165e..b031371f 100644
--- a/examples/other/ngram_lm/s0/local/build_zh_lm.sh
+++ b/examples/other/ngram_lm/s0/local/build_zh_lm.sh
@@ -27,7 +27,7 @@ arpa=$3
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
     # text tn & wordseg preprocess
     echo "process text."
-    python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
+    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
 fi
 
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
diff --git a/setup.py b/setup.py
index 82ff6341..01647a4b 100644
--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@ base = [
     "webrtcvad",
     "yacs~=0.1.8",
     "prettytable",
+    "zhon",
 ]
 
 server = [
@@ -90,7 +91,6 @@ requirements = {
         "unidecode",
         "yq",
         "pre-commit",
-        "zhon",
     ]
 }
 
diff --git a/speechx/examples/ds2_ol/aishell/README.md b/speechx/examples/ds2_ol/aishell/README.md
index 1ee73a33..f4a81516 100644
--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -24,4 +24,4 @@ LM: aishell train
 Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
 Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
 Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
-```
\ No newline at end of file
+```
diff --git a/speechx/examples/ngram/zh/README.md b/speechx/examples/ngram/zh/README.md
index be2062db..e11bd343 100644
--- a/speechx/examples/ngram/zh/README.md
+++ b/speechx/examples/ngram/zh/README.md
@@ -98,4 +98,4 @@ Ignoring words 瑗, which contains oov unit
 file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
 0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
 build LM done.
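As a sanity check on the `ngram-count` log quoted above: SRILM computes `ppl` as 10^(-logprob / (sentences + words - OOVs)) and `ppl1` as 10^(-logprob / (words - OOVs)), so both quoted figures can be reproduced from the logged counts with a couple of lines of Python:

```python
# Reproduce SRILM's ppl/ppl1 from the heldout log line above.
logprob, sents, words, oovs = -270337.9, 10000, 89496, 0
ppl = 10**(-logprob / (sents + words - oovs))  # -> ~521.28
ppl1 = 10**(-logprob / (words - oovs))         # -> ~1048.75
print(f'ppl={ppl:.4f} ppl1={ppl1:.3f}')
```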
-``` \ No newline at end of file +``` diff --git a/speechx/examples/text_lm/README.md b/speechx/examples/text_lm/README.md new file mode 100644 index 00000000..627ed3df --- /dev/null +++ b/speechx/examples/text_lm/README.md @@ -0,0 +1,15 @@ +# Text PreProcess for building ngram LM + +Output `text` file like this: + +``` +BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购 +BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉 +BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +BAC009S0002W0125 各地 政府 便 纷纷 跟进 +BAC009S0002W0126 仅 一 个 多 月 的 时间 里 +BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +BAC009S0002W0128 四十六 个 限 购 城市 当中 +BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购 +BAC009S0002W0130 财政 金融 政策 紧随 其后 而来 +``` diff --git a/speechx/examples/text_lm/path.sh b/speechx/examples/text_lm/path.sh index e69de29b..541f852c 100644 --- a/speechx/examples/text_lm/path.sh +++ b/speechx/examples/text_lm/path.sh @@ -0,0 +1,4 @@ +MAIN_ROOT=`realpath $PWD/../../../../` +SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx` + +export LC_AL=C diff --git a/speechx/examples/text_lm/run.sh b/speechx/examples/text_lm/run.sh old mode 100644 new mode 100755 index e69de29b..0a733b49 --- a/speechx/examples/text_lm/run.sh +++ b/speechx/examples/text_lm/run.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -eo pipefail + +. path.sh + +stage=0 +stop_stage=100 +has_key=true +token_type=word + +. utils/parse_options.sh || exit -1; + +text=data/text + +if [ ! -f $text ]; then + echo "$0: Not find $1"; + exit -1; +fi + +if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then + echo "text tn & wordseg preprocess" + rm -rf ${text}.tn + python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn +fi \ No newline at end of file diff --git a/speechx/examples/text_lm/utils b/speechx/examples/text_lm/utils new file mode 120000 index 00000000..256f914a --- /dev/null +++ b/speechx/examples/text_lm/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md index dd9b926f..4f862a25 100644 --- a/speechx/examples/wfst/README.md +++ b/speechx/examples/wfst/README.md @@ -15,4 +15,4 @@ fstdeterminizestar --use-log=true fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst Composing decoding graph TLG.fst succeeded Aishell build TLG done. -``` \ No newline at end of file +``` diff --git a/utils/compute-wer.py b/utils/compute-wer.py index 560349a0..978a80c9 100755 --- a/utils/compute-wer.py +++ b/utils/compute-wer.py @@ -1,62 +1,66 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- - # CopyRight WeNet Apache-2.0 License - -import re, sys, unicodedata import codecs +import re +import sys +import unicodedata remove_tag = True -spacelist= [' ', '\t', '\r', '\n'] -puncts = ['!', ',', '?', - '、', '。', '!', ',', ';', '?', - ':', '「', '」', '︰', '『', '』', '《', '》'] +spacelist = [' ', '\t', '\r', '\n'] +puncts = [ + '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』', + '《', '》' +] + + +def characterize(string): + res = [] + i = 0 + while i < len(string): + char = string[i] + if char in puncts: + i += 1 + continue + cat1 = unicodedata.category(char) + #https://unicodebook.readthedocs.io/unicode.html#unicode-categories + if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned + i += 1 + continue + if cat1 == 'Lo': # letter-other + res.append(char) + i += 1 + else: + # some input looks like: , we want to separate it to two words. 
+ sep = ' ' + if char == '<': sep = '>' + j = i + 1 + while j < len(string): + c = string[j] + if ord(c) >= 128 or (c in spacelist) or (c == sep): + break + j += 1 + if j < len(string) and string[j] == '>': + j += 1 + res.append(string[i:j]) + i = j + return res -def characterize(string) : - res = [] - i = 0 - while i < len(string): - char = string[i] - if char in puncts: - i += 1 - continue - cat1 = unicodedata.category(char) - #https://unicodebook.readthedocs.io/unicode.html#unicode-categories - if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned - i += 1 - continue - if cat1 == 'Lo': # letter-other - res.append(char) - i += 1 - else: - # some input looks like: , we want to separate it to two words. - sep = ' ' - if char == '<': sep = '>' - j = i+1 - while j < len(string): - c = string[j] - if ord(c) >= 128 or (c in spacelist) or (c==sep): - break - j += 1 - if j < len(string) and string[j] == '>': - j += 1 - res.append(string[i:j]) - i = j - return res def stripoff_tags(x): - if not x: return '' - chars = [] - i = 0; T=len(x) - while i < T: - if x[i] == '<': - while i < T and x[i] != '>': - i += 1 - i += 1 - else: - chars.append(x[i]) - i += 1 - return ''.join(chars) + if not x: return '' + chars = [] + i = 0 + T = len(x) + while i < T: + if x[i] == '<': + while i < T and x[i] != '>': + i += 1 + i += 1 + else: + chars.append(x[i]) + i += 1 + return ''.join(chars) def normalize(sentence, ignore_words, cs, split=None): @@ -66,436 +70,485 @@ def normalize(sentence, ignore_words, cs, split=None): for token in sentence: x = token if not cs: - x = x.upper() + x = x.upper() if x in ignore_words: - continue + continue if remove_tag: - x = stripoff_tags(x) + x = stripoff_tags(x) if not x: - continue + continue if split and x in split: - new_sentence += split[x] + new_sentence += split[x] else: - new_sentence.append(x) + new_sentence.append(x) return new_sentence -class Calculator : - def __init__(self) : - self.data = {} - self.space = [] - self.cost = {} - self.cost['cor'] = 0 - self.cost['sub'] = 1 - self.cost['del'] = 1 - self.cost['ins'] = 1 - def calculate(self, lab, rec) : - # Initialization - lab.insert(0, '') - rec.insert(0, '') - while len(self.space) < len(lab) : - self.space.append([]) - for row in self.space : - for element in row : - element['dist'] = 0 - element['error'] = 'non' - while len(row) < len(rec) : - row.append({'dist' : 0, 'error' : 'non'}) - for i in range(len(lab)) : - self.space[i][0]['dist'] = i - self.space[i][0]['error'] = 'del' - for j in range(len(rec)) : - self.space[0][j]['dist'] = j - self.space[0][j]['error'] = 'ins' - self.space[0][0]['error'] = 'non' - for token in lab : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - for token in rec : - if token not in self.data and len(token) > 0 : - self.data[token] = {'all' : 0, 'cor' : 0, 'sub' : 0, 'ins' : 0, 'del' : 0} - # Computing edit distance - for i, lab_token in enumerate(lab) : - for j, rec_token in enumerate(rec) : - if i == 0 or j == 0 : - continue - min_dist = sys.maxsize - min_error = 'none' - dist = self.space[i-1][j]['dist'] + self.cost['del'] - error = 'del' - if dist < min_dist : - min_dist = dist - min_error = error - dist = self.space[i][j-1]['dist'] + self.cost['ins'] - error = 'ins' - if dist < min_dist : - min_dist = dist - min_error = error - if lab_token == rec_token : - dist = self.space[i-1][j-1]['dist'] + self.cost['cor'] - error = 'cor' - else : - dist = self.space[i-1][j-1]['dist'] + 
self.cost['sub'] - error = 'sub' - if dist < min_dist : - min_dist = dist - min_error = error - self.space[i][j]['dist'] = min_dist - self.space[i][j]['error'] = min_error - # Tracing back - result = {'lab':[], 'rec':[], 'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - i = len(lab) - 1 - j = len(rec) - 1 - while True : - if self.space[i][j]['error'] == 'cor' : # correct - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 - result['all'] = result['all'] + 1 - result['cor'] = result['cor'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'sub' : # substitution - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 - result['all'] = result['all'] + 1 - result['sub'] = result['sub'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, rec[j]) - i = i - 1 - j = j - 1 - elif self.space[i][j]['error'] == 'del' : # deletion - if len(lab[i]) > 0 : - self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 - self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 - result['all'] = result['all'] + 1 - result['del'] = result['del'] + 1 - result['lab'].insert(0, lab[i]) - result['rec'].insert(0, "") - i = i - 1 - elif self.space[i][j]['error'] == 'ins' : # insertion - if len(rec[j]) > 0 : - self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 - result['ins'] = result['ins'] + 1 - result['lab'].insert(0, "") - result['rec'].insert(0, rec[j]) - j = j - 1 - elif self.space[i][j]['error'] == 'non' : # starting point - break - else : # shouldn't reach here - print('this should not happen , i = {i} , j = {j} , error = {error}'.format(i = i, j = j, error = self.space[i][j]['error'])) - return result - def overall(self) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def cluster(self, data) : - result = {'all':0, 'cor':0, 'sub':0, 'ins':0, 'del':0} - for token in data : - if token in self.data : - result['all'] = result['all'] + self.data[token]['all'] - result['cor'] = result['cor'] + self.data[token]['cor'] - result['sub'] = result['sub'] + self.data[token]['sub'] - result['ins'] = result['ins'] + self.data[token]['ins'] - result['del'] = result['del'] + self.data[token]['del'] - return result - def keys(self) : - return list(self.data.keys()) + +class Calculator: + def __init__(self): + self.data = {} + self.space = [] + self.cost = {} + self.cost['cor'] = 0 + self.cost['sub'] = 1 + self.cost['del'] = 1 + self.cost['ins'] = 1 + + def calculate(self, lab, rec): + # Initialization + lab.insert(0, '') + rec.insert(0, '') + while len(self.space) < len(lab): + self.space.append([]) + for row in self.space: + for element in row: + element['dist'] = 0 + element['error'] = 'non' + while len(row) < len(rec): + row.append({'dist': 0, 'error': 'non'}) + for i in range(len(lab)): + self.space[i][0]['dist'] = i + self.space[i][0]['error'] = 'del' + for j in range(len(rec)): + self.space[0][j]['dist'] = j + self.space[0][j]['error'] = 'ins' + self.space[0][0]['error'] = 'non' + for token in lab: + if token not in 
self.data and len(token) > 0: + self.data[token] = { + 'all': 0, + 'cor': 0, + 'sub': 0, + 'ins': 0, + 'del': 0 + } + for token in rec: + if token not in self.data and len(token) > 0: + self.data[token] = { + 'all': 0, + 'cor': 0, + 'sub': 0, + 'ins': 0, + 'del': 0 + } + # Computing edit distance + for i, lab_token in enumerate(lab): + for j, rec_token in enumerate(rec): + if i == 0 or j == 0: + continue + min_dist = sys.maxsize + min_error = 'none' + dist = self.space[i - 1][j]['dist'] + self.cost['del'] + error = 'del' + if dist < min_dist: + min_dist = dist + min_error = error + dist = self.space[i][j - 1]['dist'] + self.cost['ins'] + error = 'ins' + if dist < min_dist: + min_dist = dist + min_error = error + if lab_token == rec_token: + dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor'] + error = 'cor' + else: + dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub'] + error = 'sub' + if dist < min_dist: + min_dist = dist + min_error = error + self.space[i][j]['dist'] = min_dist + self.space[i][j]['error'] = min_error + # Tracing back + result = { + 'lab': [], + 'rec': [], + 'all': 0, + 'cor': 0, + 'sub': 0, + 'ins': 0, + 'del': 0 + } + i = len(lab) - 1 + j = len(rec) - 1 + while True: + if self.space[i][j]['error'] == 'cor': # correct + if len(lab[i]) > 0: + self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 + self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1 + result['all'] = result['all'] + 1 + result['cor'] = result['cor'] + 1 + result['lab'].insert(0, lab[i]) + result['rec'].insert(0, rec[j]) + i = i - 1 + j = j - 1 + elif self.space[i][j]['error'] == 'sub': # substitution + if len(lab[i]) > 0: + self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 + self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1 + result['all'] = result['all'] + 1 + result['sub'] = result['sub'] + 1 + result['lab'].insert(0, lab[i]) + result['rec'].insert(0, rec[j]) + i = i - 1 + j = j - 1 + elif self.space[i][j]['error'] == 'del': # deletion + if len(lab[i]) > 0: + self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1 + self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1 + result['all'] = result['all'] + 1 + result['del'] = result['del'] + 1 + result['lab'].insert(0, lab[i]) + result['rec'].insert(0, "") + i = i - 1 + elif self.space[i][j]['error'] == 'ins': # insertion + if len(rec[j]) > 0: + self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1 + result['ins'] = result['ins'] + 1 + result['lab'].insert(0, "") + result['rec'].insert(0, rec[j]) + j = j - 1 + elif self.space[i][j]['error'] == 'non': # starting point + break + else: # shouldn't reach here + print( + 'this should not happen , i = {i} , j = {j} , error = {error}'. 
+ format(i=i, j=j, error=self.space[i][j]['error'])) + return result + + def overall(self): + result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} + for token in self.data: + result['all'] = result['all'] + self.data[token]['all'] + result['cor'] = result['cor'] + self.data[token]['cor'] + result['sub'] = result['sub'] + self.data[token]['sub'] + result['ins'] = result['ins'] + self.data[token]['ins'] + result['del'] = result['del'] + self.data[token]['del'] + return result + + def cluster(self, data): + result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0} + for token in data: + if token in self.data: + result['all'] = result['all'] + self.data[token]['all'] + result['cor'] = result['cor'] + self.data[token]['cor'] + result['sub'] = result['sub'] + self.data[token]['sub'] + result['ins'] = result['ins'] + self.data[token]['ins'] + result['del'] = result['del'] + self.data[token]['del'] + return result + + def keys(self): + return list(self.data.keys()) + def width(string): - return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) + return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string) -def default_cluster(word) : - unicode_names = [ unicodedata.name(char) for char in word ] - for i in reversed(range(len(unicode_names))) : - if unicode_names[i].startswith('DIGIT') : # 1 - unicode_names[i] = 'Number' # 'DIGIT' - elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or - unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')) : - # 明 / 郎 - unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' - elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or - unicode_names[i].startswith('LATIN SMALL LETTER')) : - # A / a - unicode_names[i] = 'English' # 'LATIN LETTER' - elif unicode_names[i].startswith('HIRAGANA LETTER') : # は こ め - unicode_names[i] = 'Japanese' # 'GANA LETTER' - elif (unicode_names[i].startswith('AMPERSAND') or - unicode_names[i].startswith('APOSTROPHE') or - unicode_names[i].startswith('COMMERCIAL AT') or - unicode_names[i].startswith('DEGREE CELSIUS') or - unicode_names[i].startswith('EQUALS SIGN') or - unicode_names[i].startswith('FULL STOP') or - unicode_names[i].startswith('HYPHEN-MINUS') or - unicode_names[i].startswith('LOW LINE') or - unicode_names[i].startswith('NUMBER SIGN') or - unicode_names[i].startswith('PLUS SIGN') or - unicode_names[i].startswith('SEMICOLON')) : - # & / ' / @ / ℃ / = / . 
/ - / _ / # / + / ; - del unicode_names[i] - else : - return 'Other' - if len(unicode_names) == 0 : - return 'Other' - if len(unicode_names) == 1 : - return unicode_names[0] - for i in range(len(unicode_names)-1) : - if unicode_names[i] != unicode_names[i+1] : - return 'Other' - return unicode_names[0] -def usage() : - print("compute-wer.py : compute word error rate (WER) and align recognition results and references.") - print(" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer") +def default_cluster(word): + unicode_names = [unicodedata.name(char) for char in word] + for i in reversed(range(len(unicode_names))): + if unicode_names[i].startswith('DIGIT'): # 1 + unicode_names[i] = 'Number' # 'DIGIT' + elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or + unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')): + # 明 / 郎 + unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH' + elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or + unicode_names[i].startswith('LATIN SMALL LETTER')): + # A / a + unicode_names[i] = 'English' # 'LATIN LETTER' + elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め + unicode_names[i] = 'Japanese' # 'GANA LETTER' + elif (unicode_names[i].startswith('AMPERSAND') or + unicode_names[i].startswith('APOSTROPHE') or + unicode_names[i].startswith('COMMERCIAL AT') or + unicode_names[i].startswith('DEGREE CELSIUS') or + unicode_names[i].startswith('EQUALS SIGN') or + unicode_names[i].startswith('FULL STOP') or + unicode_names[i].startswith('HYPHEN-MINUS') or + unicode_names[i].startswith('LOW LINE') or + unicode_names[i].startswith('NUMBER SIGN') or + unicode_names[i].startswith('PLUS SIGN') or + unicode_names[i].startswith('SEMICOLON')): + # & / ' / @ / ℃ / = / . / - / _ / # / + / ; + del unicode_names[i] + else: + return 'Other' + if len(unicode_names) == 0: + return 'Other' + if len(unicode_names) == 1: + return unicode_names[0] + for i in range(len(unicode_names) - 1): + if unicode_names[i] != unicode_names[i + 1]: + return 'Other' + return unicode_names[0] + + +def usage(): + print( + "compute-wer.py : compute word error rate (WER) and align recognition results and references." 
+ ) + print( + " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer" + ) + if __name__ == '__main__': - if len(sys.argv) == 1 : - usage() - sys.exit(0) - calculator = Calculator() - cluster_file = '' - ignore_words = set() - tochar = False - verbose= 1 - padding_symbol= ' ' - case_sensitive = False - max_words_per_line = sys.maxsize - split = None - while len(sys.argv) > 3: - a = '--maxw=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):] - del sys.argv[1] - max_words_per_line = int(b) - continue - a = '--rt=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - remove_tag = (b == 'true') or (b != '0') - continue - a = '--cs=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - case_sensitive = (b == 'true') or (b != '0') - continue - a = '--cluster=' - if sys.argv[1].startswith(a): - cluster_file = sys.argv[1][len(a):] - del sys.argv[1] - continue - a = '--splitfile=' - if sys.argv[1].startswith(a): - split_file = sys.argv[1][len(a):] - del sys.argv[1] - split = dict() - with codecs.open(split_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - words = line.strip().split() - if len(words) >= 2: - split[words[0]] = words[1:] - continue - a = '--ig=' - if sys.argv[1].startswith(a): - ignore_file = sys.argv[1][len(a):] - del sys.argv[1] - with codecs.open(ignore_file, 'r', 'utf-8') as fh: - for line in fh: # line in unicode - line = line.strip() - if len(line) > 0: - ignore_words.add(line) - continue - a = '--char=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - tochar = (b == 'true') or (b != '0') - continue - a = '--v=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - verbose=0 - try: - verbose=int(b) - except: - if b == 'true' or b != '0': - verbose = 1 - continue - a = '--padding-symbol=' - if sys.argv[1].startswith(a): - b = sys.argv[1][len(a):].lower() - del sys.argv[1] - if b == 'space': - padding_symbol= ' ' - elif b == 'underline': - padding_symbol= '_' - continue - if True or sys.argv[1].startswith('-'): - #ignore invalid switch - del sys.argv[1] - continue + if len(sys.argv) == 1: + usage() + sys.exit(0) + calculator = Calculator() + cluster_file = '' + ignore_words = set() + tochar = False + verbose = 1 + padding_symbol = ' ' + case_sensitive = False + max_words_per_line = sys.maxsize + split = None + while len(sys.argv) > 3: + a = '--maxw=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):] + del sys.argv[1] + max_words_per_line = int(b) + continue + a = '--rt=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + remove_tag = (b == 'true') or (b != '0') + continue + a = '--cs=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + case_sensitive = (b == 'true') or (b != '0') + continue + a = '--cluster=' + if sys.argv[1].startswith(a): + cluster_file = sys.argv[1][len(a):] + del sys.argv[1] + continue + a = '--splitfile=' + if sys.argv[1].startswith(a): + split_file = sys.argv[1][len(a):] + del sys.argv[1] + split = dict() + with codecs.open(split_file, 'r', 'utf-8') as fh: + for line in fh: # line in unicode + words = line.strip().split() + if len(words) >= 2: + split[words[0]] = words[1:] + continue + a = '--ig=' + if sys.argv[1].startswith(a): + ignore_file = sys.argv[1][len(a):] + del sys.argv[1] + with 
codecs.open(ignore_file, 'r', 'utf-8') as fh: + for line in fh: # line in unicode + line = line.strip() + if len(line) > 0: + ignore_words.add(line) + continue + a = '--char=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + tochar = (b == 'true') or (b != '0') + continue + a = '--v=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + verbose = 0 + try: + verbose = int(b) + except: + if b == 'true' or b != '0': + verbose = 1 + continue + a = '--padding-symbol=' + if sys.argv[1].startswith(a): + b = sys.argv[1][len(a):].lower() + del sys.argv[1] + if b == 'space': + padding_symbol = ' ' + elif b == 'underline': + padding_symbol = '_' + continue + if True or sys.argv[1].startswith('-'): + #ignore invalid switch + del sys.argv[1] + continue - if not case_sensitive: - ig=set([w.upper() for w in ignore_words]) - ignore_words = ig + if not case_sensitive: + ig = set([w.upper() for w in ignore_words]) + ignore_words = ig - default_clusters = {} - default_words = {} + default_clusters = {} + default_words = {} - ref_file = sys.argv[1] - hyp_file = sys.argv[2] - rec_set = {} - if split and not case_sensitive: - newsplit = dict() - for w in split: - words = split[w] - for i in range(len(words)): - words[i] = words[i].upper() - newsplit[w.upper()] = words - split = newsplit + ref_file = sys.argv[1] + hyp_file = sys.argv[2] + rec_set = {} + if split and not case_sensitive: + newsplit = dict() + for w in split: + words = split[w] + for i in range(len(words)): + words[i] = words[i].upper() + newsplit[w.upper()] = words + split = newsplit - with codecs.open(hyp_file, 'r', 'utf-8') as fh: - for line in fh: + with codecs.open(hyp_file, 'r', 'utf-8') as fh: + for line in fh: + if tochar: + array = characterize(line) + else: + array = line.strip().split() + if len(array) == 0: continue + fid = array[0] + rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, + split) + + # compute error rate on the interaction of reference file and hyp file + for line in open(ref_file, 'r', encoding='utf-8'): if tochar: array = characterize(line) else: - array = line.strip().split() - if len(array)==0: continue + array = line.rstrip('\n').split() + if len(array) == 0: continue fid = array[0] - rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split) + if fid not in rec_set: + continue + lab = normalize(array[1:], ignore_words, case_sensitive, split) + rec = rec_set[fid] + if verbose: + print('\nutt: %s' % fid) - # compute error rate on the interaction of reference file and hyp file - for line in open(ref_file, 'r', encoding='utf-8') : - if tochar: - array = characterize(line) - else: - array = line.rstrip('\n').split() - if len(array)==0: continue - fid = array[0] - if fid not in rec_set: - continue - lab = normalize(array[1:], ignore_words, case_sensitive, split) - rec = rec_set[fid] - if verbose: - print('\nutt: %s' % fid) + for word in rec + lab: + if word not in default_words: + default_cluster_name = default_cluster(word) + if default_cluster_name not in default_clusters: + default_clusters[default_cluster_name] = {} + if word not in default_clusters[default_cluster_name]: + default_clusters[default_cluster_name][word] = 1 + default_words[word] = default_cluster_name - for word in rec + lab : - if word not in default_words : - default_cluster_name = default_cluster(word) - if default_cluster_name not in default_clusters : - default_clusters[default_cluster_name] = {} - if word not in 
default_clusters[default_cluster_name] : - default_clusters[default_cluster_name][word] = 1 - default_words[word] = default_cluster_name + result = calculator.calculate(lab, rec) + if verbose: + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result[ + 'del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('WER: %4.2f %%' % wer, end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], + result['ins'])) + space = {} + space['lab'] = [] + space['rec'] = [] + for idx in range(len(result['lab'])): + len_lab = width(result['lab'][idx]) + len_rec = width(result['rec'][idx]) + length = max(len_lab, len_rec) + space['lab'].append(length - len_lab) + space['rec'].append(length - len_rec) + upper_lab = len(result['lab']) + upper_rec = len(result['rec']) + lab1, rec1 = 0, 0 + while lab1 < upper_lab or rec1 < upper_rec: + if verbose > 1: + print('lab(%s):' % fid.encode('utf-8'), end=' ') + else: + print('lab:', end=' ') + lab2 = min(upper_lab, lab1 + max_words_per_line) + for idx in range(lab1, lab2): + token = result['lab'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['lab'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print() + if verbose > 1: + print('rec(%s):' % fid.encode('utf-8'), end=' ') + else: + print('rec:', end=' ') + rec2 = min(upper_rec, rec1 + max_words_per_line) + for idx in range(rec1, rec2): + token = result['rec'][idx] + print('{token}'.format(token=token), end='') + for n in range(space['rec'][idx]): + print(padding_symbol, end='') + print(' ', end='') + print('\n', end='\n') + lab1 = lab2 + rec1 = rec2 - result = calculator.calculate(lab, rec) if verbose: - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('WER: %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - space = {} - space['lab'] = [] - space['rec'] = [] - for idx in range(len(result['lab'])) : - len_lab = width(result['lab'][idx]) - len_rec = width(result['rec'][idx]) - length = max(len_lab, len_rec) - space['lab'].append(length-len_lab) - space['rec'].append(length-len_rec) - upper_lab = len(result['lab']) - upper_rec = len(result['rec']) - lab1, rec1 = 0, 0 - while lab1 < upper_lab or rec1 < upper_rec: - if verbose > 1: - print('lab(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('lab:', end = ' ') - lab2 = min(upper_lab, lab1 + max_words_per_line) - for idx in range(lab1, lab2): - token = result['lab'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['lab'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print() - if verbose > 1: - print('rec(%s):' % fid.encode('utf-8'), end = ' ') - else: - print('rec:', end = ' ') - rec2 = min(upper_rec, rec1 + max_words_per_line) - for idx in range(rec1, rec2): - token = result['rec'][idx] - print('{token}'.format(token = token), end = '') - for n in range(space['rec'][idx]) : - print(padding_symbol, end = '') - print(' ',end='') - print('\n', end='\n') - lab1 = lab2 - rec1 = rec2 - - if verbose: - print('===========================================================================') - print() - - result = calculator.overall() - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('Overall -> %4.2f %%' % wer, end = ' ') - print('N=%d C=%d S=%d 
D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if not verbose: - print() + print( + '===========================================================================' + ) + print() - if verbose: - for cluster_id in default_clusters : - result = calculator.cluster([ k for k in default_clusters[cluster_id] ]) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : + result = calculator.overall() + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result[ + 'del']) * 100.0 / result['all'] + else: wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - if len(cluster_file) > 0 : # compute separated WERs for word clusters - cluster_id = '' - cluster = [] - for line in open(cluster_file, 'r', encoding='utf-8') : - for token in line.decode('utf-8').rstrip('\n').split() : - # end of cluster reached, like - if token[0:2] == '' and \ - token.lstrip('') == cluster_id : - result = calculator.cluster(cluster) - if result['all'] != 0 : - wer = float(result['ins'] + result['sub'] + result['del']) * 100.0 / result['all'] - else : - wer = 0.0 - print('%s -> %4.2f %%' % (cluster_id, wer), end = ' ') - print('N=%d C=%d S=%d D=%d I=%d' % - (result['all'], result['cor'], result['sub'], result['del'], result['ins'])) - cluster_id = '' - cluster = [] - # begin of cluster reached, like - elif token[0] == '<' and token[len(token)-1] == '>' and \ - cluster_id == '' : - cluster_id = token.lstrip('<').rstrip('>') - cluster = [] - # general terms, like WEATHER / CAR / ... - else : - cluster.append(token) - print() - print('===========================================================================') + print('Overall -> %4.2f %%' % wer, end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], + result['ins'])) + if not verbose: + print() + + if verbose: + for cluster_id in default_clusters: + result = calculator.cluster( + [k for k in default_clusters[cluster_id]]) + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result[ + 'del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], result['del'], + result['ins'])) + if len(cluster_file) > 0: # compute separated WERs for word clusters + cluster_id = '' + cluster = [] + for line in open(cluster_file, 'r', encoding='utf-8'): + for token in line.decode('utf-8').rstrip('\n').split(): + # end of cluster reached, like + if token[0:2] == '' and \ + token.lstrip('') == cluster_id : + result = calculator.cluster(cluster) + if result['all'] != 0: + wer = float(result['ins'] + result['sub'] + result[ + 'del']) * 100.0 / result['all'] + else: + wer = 0.0 + print('%s -> %4.2f %%' % (cluster_id, wer), end=' ') + print('N=%d C=%d S=%d D=%d I=%d' % + (result['all'], result['cor'], result['sub'], + result['del'], result['ins'])) + cluster_id = '' + cluster = [] + # begin of cluster reached, like + elif token[0] == '<' and token[len(token)-1] == '>' and \ + cluster_id == '' : + cluster_id = token.lstrip('<').rstrip('>') + cluster = [] + # general terms, like WEATHER / CAR / ... 
+ else: + cluster.append(token) + print() + print( + '===========================================================================' + ) diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py index 0ab3ae08..3825fb9b 100755 --- a/utils/manifest_key_value.py +++ b/utils/manifest_key_value.py @@ -34,7 +34,7 @@ def main(args): utt2spk = line_json['utt2spk'] # input - assert(len(line_json['input']) == 1), "only support one input now" + assert (len(line_json['input']) == 1), "only support one input now" input_json = line_json['input'][0] feat = input_json['feat'] feat_shape = input_json['shape'] @@ -49,7 +49,8 @@ def main(args): fdur.write(f"{utt} {dur}\n") # output - assert(len(line_json['output']) == 1), "only support one output now" + assert ( + len(line_json['output']) == 1), "only support one output now" output_json = line_json['output'][0] text = output_json['text'] if 'token' in output_json: diff --git a/utils/zh_tn.py b/utils/zh_tn.py index 4dcf2743..73bb8af2 100755 --- a/utils/zh_tn.py +++ b/utils/zh_tn.py @@ -4,6 +4,7 @@ import argparse import re import string import sys +import unicodedata from typing import List from typing import Text @@ -33,6 +34,14 @@ POINT = [u'点', u'點'] # PLUS = [u'加', u'加'] # SIL = [u'杠', u'槓'] +FILLER_CHARS = ['呃', '啊'] + +ER_WHITELIST = '(儿女|儿子|儿孙|女儿|儿媳|妻儿|' \ + '胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|' \ + '儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|' \ + '佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)' +ER_WHITELIST_PATTERN = re.compile(ER_WHITELIST) + # 中文数字系统类型 NUMBERING_TYPES = ['low', 'mid', 'high'] @@ -48,15 +57,330 @@ COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘| # punctuation information are based on Zhon project (https://github.com/tsroten/zhon.git) CHINESE_PUNC_STOP = '!?。。' -CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏' -CHINESE_PUNC_OTHER = '·〈〉-' -CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + CHINESE_PUNC_OTHER +CHINESE_PUNC_NON_STOP = '"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏·〈〉-' +CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP + +# https://zh.wikipedia.org/wiki/全行和半行 +QJ2BJ = { + ' ': ' ', + '!': '!', + '"': '"', + '#': '#', + '$': '$', + '%': '%', + '&': '&', + ''': "'", + '(': '(', + ')': ')', + '*': '*', + '+': '+', + ',': ',', + '-': '-', + '.': '.', + '/': '/', + '0': '0', + '1': '1', + '2': '2', + '3': '3', + '4': '4', + '5': '5', + '6': '6', + '7': '7', + '8': '8', + '9': '9', + ':': ':', + ';': ';', + '<': '<', + '=': '=', + '>': '>', + '?': '?', + '@': '@', + 'A': 'A', + 'B': 'B', + 'C': 'C', + 'D': 'D', + 'E': 'E', + 'F': 'F', + 'G': 'G', + 'H': 'H', + 'I': 'I', + 'J': 'J', + 'K': 'K', + 'L': 'L', + 'M': 'M', + 'N': 'N', + 'O': 'O', + 'P': 'P', + 'Q': 'Q', + 'R': 'R', + 'S': 'S', + 'T': 'T', + 'U': 'U', + 'V': 'V', + 'W': 'W', + 'X': 'X', + 'Y': 'Y', + 'Z': 'Z', + '[': '[', + '\': '\\', + ']': ']', + '^': '^', + '_': '_', + '`': '`', + 'a': 'a', + 'b': 'b', + 'c': 'c', + 'd': 'd', + 'e': 'e', + 'f': 'f', + 'g': 'g', + 'h': 'h', + 'i': 'i', + 'j': 'j', + 'k': 'k', + 'l': 'l', + 'm': 'm', + 'n': 'n', + 'o': 'o', + 'p': 'p', + 'q': 'q', + 'r': 'r', + 's': 's', + 't': 't', + 'u': 'u', + 'v': 'v', + 'w': 'w', + 'x': 'x', + 'y': 'y', + 'z': 'z', + '{': '{', + '|': '|', + '}': '}', + '~': '~', +} + +QJ2BJ_transform = str.maketrans(''.join(QJ2BJ.keys()), ''.join(QJ2BJ.values()), + '') + +# char set +DIGIT_CHARS = '0123456789' + +EN_CHARS = ('abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ') + +# 2013 China 
National Standard: https://zh.wikipedia.org/wiki/通用规范汉字表 +# raw resources from: https://github.com/mozillazg/pinyin-data/blob/master/kMandarin_8105.txt , with total 8105 chars +CN_CHARS = ('一丁七万丈三上下不与丏丐丑专且丕世丘丙业丛东丝丞丢两严丧个丫中丰串临丸丹为主丽举' + '乂乃久么义之乌乍乎乏乐乒乓乔乖乘乙乜九乞也习乡书乩买乱乳乸乾了予争事二亍于亏云互' + '亓五井亘亚些亟亡亢交亥亦产亨亩享京亭亮亲亳亵亶亸亹人亿什仁仂仃仄仅仆仇仉今介仍从' + '仑仓仔仕他仗付仙仝仞仟仡代令以仨仪仫们仰仲仳仵件价任份仿企伈伉伊伋伍伎伏伐休众优' + '伙会伛伞伟传伢伣伤伥伦伧伪伫伭伯估伲伴伶伸伺似伽伾佁佃但位低住佐佑体何佖佗佘余佚' + '佛作佝佞佟你佣佤佥佩佬佯佰佳佴佶佸佺佻佼佽佾使侁侂侃侄侈侉例侍侏侑侔侗侘供依侠侣' + '侥侦侧侨侩侪侬侮侯侴侵侹便促俄俅俊俍俎俏俐俑俗俘俙俚俜保俞俟信俣俦俨俩俪俫俭修俯' + '俱俳俵俶俸俺俾倌倍倏倒倓倔倕倘候倚倜倞借倡倥倦倧倨倩倪倬倭倮倴债倻值倾偁偃假偈偌' + '偎偏偓偕做停偡健偬偭偰偲偶偷偻偾偿傀傃傅傈傉傍傒傕傣傥傧储傩催傲傺傻僇僎像僔僖僚' + '僦僧僬僭僮僰僳僵僻儆儇儋儒儡儦儳儴儿兀允元兄充兆先光克免兑兔兕兖党兜兢入全八公六' + '兮兰共关兴兵其具典兹养兼兽冀冁内冈冉册再冏冒冔冕冗写军农冠冢冤冥冬冮冯冰冱冲决况' + '冶冷冻冼冽净凄准凇凉凋凌减凑凓凘凛凝几凡凤凫凭凯凰凳凶凸凹出击凼函凿刀刁刃分切刈' + '刊刍刎刑划刖列刘则刚创初删判刨利别刬刭刮到刳制刷券刹刺刻刽刿剀剁剂剃剅削剋剌前剐' + '剑剔剕剖剜剞剟剡剥剧剩剪副割剽剿劁劂劄劈劐劓力劝办功加务劢劣动助努劫劬劭励劲劳劼' + '劾势勃勇勉勋勍勐勒勔勖勘勚募勠勤勰勺勾勿匀包匆匈匍匏匐匕化北匙匜匝匠匡匣匦匪匮匹' + '区医匼匾匿十千卅升午卉半华协卑卒卓单卖南博卜卞卟占卡卢卣卤卦卧卫卬卮卯印危即却卵' + '卷卸卺卿厂厄厅历厉压厌厍厕厖厘厚厝原厢厣厥厦厨厩厮去厾县叁参叆叇又叉及友双反发叔' + '叕取受变叙叚叛叟叠口古句另叨叩只叫召叭叮可台叱史右叵叶号司叹叻叼叽吁吃各吆合吉吊' + '同名后吏吐向吒吓吕吖吗君吝吞吟吠吡吣否吧吨吩含听吭吮启吱吲吴吵吸吹吻吼吽吾呀呃呆' + '呇呈告呋呐呒呓呔呕呖呗员呙呛呜呢呣呤呦周呱呲味呵呶呷呸呻呼命咀咂咄咆咇咉咋和咍咎' + '咏咐咒咔咕咖咙咚咛咝咡咣咤咥咦咧咨咩咪咫咬咯咱咳咴咸咺咻咽咿哀品哂哃哄哆哇哈哉哌' + '响哎哏哐哑哒哓哔哕哗哙哚哝哞哟哢哥哦哧哨哩哪哭哮哱哲哳哺哼哽哿唁唆唇唉唏唐唑唔唛' + '唝唠唢唣唤唧唪唬售唯唰唱唳唵唷唼唾唿啁啃啄商啉啊啐啕啖啜啡啤啥啦啧啪啫啬啭啮啰啴' + '啵啶啷啸啻啼啾喀喁喂喃善喆喇喈喉喊喋喏喑喔喘喙喜喝喟喤喧喱喳喵喷喹喻喽喾嗄嗅嗉嗌' + '嗍嗐嗑嗒嗓嗔嗖嗜嗝嗞嗟嗡嗣嗤嗥嗦嗨嗪嗫嗬嗯嗲嗳嗵嗷嗽嗾嘀嘁嘈嘉嘌嘎嘏嘘嘚嘛嘞嘟嘡' + '嘣嘤嘧嘬嘭嘱嘲嘴嘶嘹嘻嘿噀噂噇噌噍噎噔噗噘噙噜噢噤器噩噪噫噬噱噶噻噼嚄嚅嚆嚎嚏嚓' + '嚚嚣嚭嚯嚷嚼囊囔囚四回囟因囡团囤囫园困囱围囵囷囹固国图囿圃圄圆圈圉圊圌圐圙圜土圢' + '圣在圩圪圫圬圭圮圯地圲圳圹场圻圾址坂均坉坊坋坌坍坎坏坐坑坒块坚坛坜坝坞坟坠坡坤坥' + '坦坨坩坪坫坬坭坯坰坳坷坻坼坽垂垃垄垆垈型垌垍垎垏垒垓垕垙垚垛垞垟垠垡垢垣垤垦垧垩' + '垫垭垮垯垱垲垴垵垸垺垾垿埂埃埆埇埋埌城埏埒埔埕埗埘埙埚埝域埠埤埪埫埭埯埴埵埸培基' + '埼埽堂堃堆堇堉堋堌堍堎堐堑堕堙堞堠堡堤堧堨堪堰堲堵堼堽堾塄塅塆塌塍塑塔塘塝塞塥填' + '塬塱塾墀墁境墅墈墉墐墒墓墕墘墙墚增墟墡墣墦墨墩墼壁壅壑壕壤士壬壮声壳壶壸壹处备复' + '夏夐夔夕外夙多夜够夤夥大天太夫夬夭央夯失头夷夸夹夺夼奁奂奄奇奈奉奋奎奏契奓奔奕奖' + '套奘奚奠奡奢奥奭女奴奶奸她好妁如妃妄妆妇妈妊妍妒妓妖妗妘妙妞妣妤妥妧妨妩妪妫妭妮' + '妯妲妹妻妾姆姈姊始姐姑姒姓委姗姘姚姜姝姞姣姤姥姨姬姮姱姶姹姻姽姿娀威娃娄娅娆娇娈' + '娉娌娑娓娘娜娟娠娣娥娩娱娲娴娵娶娼婀婆婉婊婌婍婕婘婚婞婠婢婤婧婪婫婳婴婵婶婷婺婻' + '婼婿媂媄媆媒媓媖媚媛媞媪媭媱媲媳媵媸媾嫁嫂嫄嫉嫌嫒嫔嫕嫖嫘嫚嫜嫠嫡嫣嫦嫩嫪嫫嫭嫱' + '嫽嬉嬖嬗嬛嬥嬬嬴嬷嬿孀孅子孑孓孔孕孖字存孙孚孛孜孝孟孢季孤孥学孩孪孬孰孱孳孵孺孽' + '宁它宄宅宇守安宋完宏宓宕宗官宙定宛宜宝实宠审客宣室宥宦宧宪宫宬宰害宴宵家宸容宽宾' + '宿寁寂寄寅密寇富寐寒寓寝寞察寡寤寥寨寮寰寸对寺寻导寿封射将尉尊小少尔尕尖尘尚尜尝' + '尢尤尥尧尨尪尬就尴尸尹尺尻尼尽尾尿局屁层屃居屈屉届屋屎屏屐屑展屙属屠屡屣履屦屯山' + '屹屺屼屾屿岁岂岈岊岌岍岐岑岔岖岗岘岙岚岛岜岞岠岢岣岨岩岫岬岭岱岳岵岷岸岽岿峁峂峃' + '峄峋峒峗峘峙峛峡峣峤峥峦峧峨峪峭峰峱峻峿崀崁崂崃崄崆崇崌崎崒崔崖崚崛崞崟崡崤崦崧' + '崩崭崮崴崶崽崾崿嵁嵅嵇嵊嵋嵌嵎嵖嵘嵚嵛嵝嵩嵫嵬嵯嵲嵴嶂嶅嶍嶒嶓嶙嶝嶟嶦嶲嶷巅巇巉' + '巍川州巡巢工左巧巨巩巫差巯己已巳巴巷巽巾币市布帅帆师希帏帐帑帔帕帖帘帙帚帛帜帝帡' + '带帧帨席帮帱帷常帻帼帽幂幄幅幌幔幕幖幛幞幡幢幪干平年并幸幺幻幼幽广庄庆庇床庋序庐' + '庑库应底庖店庙庚府庞废庠庤庥度座庭庱庳庵庶康庸庹庼庾廆廉廊廋廑廒廓廖廙廛廨廪延廷' + '建廿开弁异弃弄弆弇弈弊弋式弑弓引弗弘弛弟张弢弥弦弧弨弩弭弯弱弶弸弹强弼彀归当录彖' + '彗彘彝彟形彤彦彧彩彪彬彭彰影彳彷役彻彼往征徂径待徇很徉徊律徐徒徕得徘徙徛徜御徨循' + '徭微徵德徼徽心必忆忉忌忍忏忐忑忒忖志忘忙忝忞忠忡忤忧忪快忭忮忱忳念忸忺忻忽忾忿怀' + '态怂怃怄怅怆怊怍怎怏怒怔怕怖怙怛怜思怠怡急怦性怨怩怪怫怯怵总怼怿恁恂恃恋恍恐恒恓' + '恔恕恙恚恝恢恣恤恧恨恩恪恫恬恭息恰恳恶恸恹恺恻恼恽恿悃悄悆悈悉悌悍悒悔悖悚悛悝悟' + '悠悢患悦您悫悬悭悯悰悱悲悴悸悻悼情惆惇惊惋惎惑惔惕惘惙惚惛惜惝惟惠惦惧惨惩惫惬惭' + '惮惯惰想惴惶惹惺愀愁愃愆愈愉愍愎意愐愔愕愚感愠愣愤愦愧愫愭愿慆慈慊慌慎慑慕慝慢慥' + '慧慨慬慭慰慵慷憋憎憔憕憙憧憨憩憬憭憷憺憾懂懈懊懋懑懒懔懦懵懿戆戈戊戋戌戍戎戏成我' + '戒戕或戗战戚戛戟戡戢戣戤戥截戬戭戮戳戴户戽戾房所扁扂扃扅扆扇扈扉扊手才扎扑扒打扔' + '托扛扞扣扦执扩扪扫扬扭扮扯扰扳扶批扺扼扽找承技抃抄抉把抑抒抓抔投抖抗折抚抛抟抠抡' + '抢护报抨披抬抱抵抹抻押抽抿拂拃拄担拆拇拈拉拊拌拍拎拐拒拓拔拖拗拘拙招拜拟拢拣拤拥' + '拦拧拨择括拭拮拯拱拳拴拶拷拼拽拾拿持挂指挈按挎挑挓挖挚挛挝挞挟挠挡挣挤挥挦挨挪挫' + '振挲挹挺挽捂捃捅捆捉捋捌捍捎捏捐捕捞损捡换捣捧捩捭据捯捶捷捺捻捽掀掂掇授掉掊掌掎' + '掏掐排掖掘掞掠探掣接控推掩措掬掭掮掰掳掴掷掸掺掼掾揄揆揉揍描提插揕揖揠握揣揩揪揭' + '揳援揶揸揽揿搀搁搂搅搋搌搏搐搒搓搔搛搜搞搠搡搦搪搬搭搴携搽摁摄摅摆摇摈摊摏摒摔摘' + '摛摞摧摩摭摴摸摹摽撂撄撅撇撑撒撕撖撙撞撤撩撬播撮撰撵撷撸撺撼擀擂擅操擎擐擒擘擞擢' + '擤擦擿攀攉攒攘攥攫攮支收攸改攻攽放政故效敉敌敏救敔敕敖教敛敝敞敢散敦敩敫敬数敲整' + '敷文斋斌斐斑斓斗料斛斜斝斟斠斡斤斥斧斩斫断斯新斶方於施旁旃旄旅旆旋旌旎族旐旒旖旗' + '旞无既日旦旧旨早旬旭旮旯旰旱旴旵时旷旸旺旻旿昀昂昃昄昆昇昈昉昊昌明昏昒易昔昕昙昝' + '星映昡昣昤春昧昨昪昫昭是昱昳昴昵昶昺昼昽显晁晃晅晊晋晌晏晐晒晓晔晕晖晗晙晚晞晟晡' + '晢晤晦晨晪晫普景晰晱晴晶晷智晾暂暄暅暇暌暑暕暖暗暝暧暨暮暲暴暵暶暹暾暿曈曌曙曛曜' + '曝曦曩曰曲曳更曷曹曼曾替最月有朋服朏朐朓朔朕朗望朝期朦木未末本札术朱朳朴朵朸机朽' + '杀杂权杄杆杈杉杌李杏材村杓杕杖杙杜杞束杠条来杧杨杩杪杭杯杰杲杳杵杷杻杼松板极构枅' + '枇枉枋枍析枕林枘枚果枝枞枢枣枥枧枨枪枫枭枯枰枲枳枵架枷枸枹柁柃柄柈柊柏某柑柒染柔' + '柖柘柙柚柜柝柞柠柢查柩柬柯柰柱柳柴柷柽柿栀栅标栈栉栊栋栌栎栏栐树栒栓栖栗栝栟校栩' + '株栲栳栴样核根栻格栽栾桀桁桂桃桄桅框案桉桊桌桎桐桑桓桔桕桠桡桢档桤桥桦桧桨桩桫桯' + '桲桴桶桷桹梁梃梅梆梌梏梓梗梠梢梣梦梧梨梭梯械梳梴梵梼梽梾梿检棁棂棉棋棍棐棒棓棕棘' + 
'棚棠棣棤棨棪棫棬森棰棱棵棹棺棻棼棽椀椁椅椆椋植椎椐椑椒椓椟椠椤椪椭椰椴椸椹椽椿楂' + '楒楔楗楙楚楝楞楠楣楦楩楪楫楮楯楷楸楹楼概榃榄榅榆榇榈榉榍榑榔榕榖榛榜榧榨榫榭榰榱' + '榴榷榻槁槃槊槌槎槐槔槚槛槜槟槠槭槱槲槽槿樊樗樘樟模樨横樯樱樵樽樾橄橇橐橑橘橙橛橞' + '橡橥橦橱橹橼檀檄檎檐檑檗檞檠檩檫檬櫆欂欠次欢欣欤欧欲欸欹欺欻款歃歅歆歇歉歌歙止正' + '此步武歧歪歹死歼殁殂殃殄殆殇殉殊残殍殒殓殖殚殛殡殣殪殳殴段殷殿毁毂毅毋毌母每毐毒' + '毓比毕毖毗毙毛毡毪毫毯毳毵毹毽氅氆氇氍氏氐民氓气氕氖氘氙氚氛氟氡氢氤氦氧氨氩氪氮' + '氯氰氲水永氾氿汀汁求汆汇汈汉汊汋汐汔汕汗汛汜汝汞江池污汤汧汨汩汪汫汭汰汲汴汶汹汽' + '汾沁沂沃沄沅沆沇沈沉沌沏沐沓沔沘沙沚沛沟没沣沤沥沦沧沨沩沪沫沭沮沱河沸油沺治沼沽' + '沾沿泂泃泄泅泇泉泊泌泐泓泔法泖泗泙泚泛泜泞泠泡波泣泥注泪泫泮泯泰泱泳泵泷泸泺泻泼' + '泽泾洁洄洇洈洋洌洎洑洒洓洗洘洙洚洛洞洢洣津洧洨洪洫洭洮洱洲洳洴洵洸洹洺活洼洽派洿' + '流浃浅浆浇浈浉浊测浍济浏浐浑浒浓浔浕浙浚浛浜浞浟浠浡浣浥浦浩浪浬浭浮浯浰浲浴海浸' + '浼涂涄涅消涉涌涍涎涐涑涓涔涕涘涛涝涞涟涠涡涢涣涤润涧涨涩涪涫涮涯液涴涵涸涿淀淄淅' + '淆淇淋淌淏淑淖淘淙淜淝淞淟淠淡淤淦淫淬淮淯深淳淴混淹添淼清渊渌渍渎渐渑渔渗渚渝渟' + '渠渡渣渤渥温渫渭港渰渲渴游渺渼湃湄湉湍湎湑湓湔湖湘湛湜湝湟湣湫湮湲湴湾湿溁溃溅溆' + '溇溉溍溏源溘溚溜溞溟溠溢溥溦溧溪溯溱溲溴溵溶溷溹溺溻溽滁滂滃滆滇滉滋滍滏滑滓滔滕' + '滗滘滚滞滟滠满滢滤滥滦滧滨滩滪滫滴滹漂漆漈漉漋漏漓演漕漖漠漤漦漩漪漫漭漯漱漳漴漶' + '漷漹漻漼漾潆潇潋潍潏潖潘潜潞潟潢潦潩潭潮潲潴潵潸潺潼潽潾澂澄澈澉澌澍澎澛澜澡澥澧' + '澪澭澳澴澶澹澼澽激濂濉濋濑濒濞濠濡濩濮濯瀌瀍瀑瀔瀚瀛瀣瀱瀵瀹瀼灈灌灏灞火灭灯灰灵' + '灶灸灼灾灿炀炅炆炉炊炌炎炒炔炕炖炘炙炜炝炟炣炫炬炭炮炯炱炳炷炸点炻炼炽烀烁烂烃烈' + '烊烔烘烙烛烜烝烟烠烤烦烧烨烩烫烬热烯烶烷烹烺烻烽焆焉焊焌焐焓焕焖焗焘焙焚焜焞焦焯' + '焰焱然煁煃煅煊煋煌煎煓煜煞煟煤煦照煨煮煲煳煴煸煺煽熄熇熊熏熔熘熙熛熜熟熠熥熨熬熵' + '熹熻燃燊燋燎燏燔燕燚燠燥燧燮燹爆爇爔爚爝爟爨爪爬爰爱爵父爷爸爹爻爽爿牁牂片版牌牍' + '牒牖牙牚牛牝牟牡牢牤牥牦牧物牮牯牲牵特牺牻牾牿犀犁犄犇犊犋犍犏犒犟犨犬犯犰犴状犷' + '犸犹狁狂狃狄狈狉狍狎狐狒狗狙狝狞狠狡狨狩独狭狮狯狰狱狲狳狴狷狸狺狻狼猁猃猄猇猊猎' + '猕猖猗猛猜猝猞猡猢猥猩猪猫猬献猯猰猱猴猷猹猺猾猿獍獐獒獗獠獬獭獯獴獾玃玄率玉王玎' + '玑玒玓玕玖玘玙玚玛玞玟玠玡玢玤玥玦玩玫玭玮环现玱玲玳玶玷玹玺玻玼玿珀珂珅珇珈珉珊' + '珋珌珍珏珐珑珒珕珖珙珛珝珞珠珢珣珥珦珧珩珪珫班珰珲珵珷珸珹珺珽琀球琄琅理琇琈琉琊' + '琎琏琐琔琚琛琟琡琢琤琥琦琨琪琫琬琭琮琯琰琲琳琴琵琶琼瑀瑁瑂瑃瑄瑅瑆瑑瑓瑔瑕瑖瑗瑙' + '瑚瑛瑜瑝瑞瑟瑢瑧瑨瑬瑭瑰瑱瑳瑶瑷瑾璀璁璃璆璇璈璋璎璐璒璘璜璞璟璠璥璧璨璩璪璬璮璱' + '璲璺瓀瓒瓖瓘瓜瓞瓠瓢瓣瓤瓦瓮瓯瓴瓶瓷瓻瓿甄甍甏甑甓甗甘甚甜生甡甥甦用甩甪甫甬甭甯' + '田由甲申电男甸町画甾畀畅畈畋界畎畏畔畖留畚畛畜畤略畦番畬畯畲畴畸畹畿疁疃疆疍疏疐' + '疑疔疖疗疙疚疝疟疠疡疢疣疤疥疫疬疭疮疯疰疱疲疳疴疵疸疹疼疽疾痂痃痄病症痈痉痊痍痒' + '痓痔痕痘痛痞痢痣痤痦痧痨痪痫痰痱痴痹痼痿瘀瘁瘃瘅瘆瘊瘌瘐瘕瘗瘘瘙瘛瘟瘠瘢瘤瘥瘦瘩' + '瘪瘫瘭瘰瘳瘴瘵瘸瘼瘾瘿癀癃癌癍癔癖癗癜癞癣癫癯癸登白百癿皂的皆皇皈皋皎皑皓皕皖皙' + '皛皞皤皦皭皮皱皲皴皿盂盅盆盈盉益盍盎盏盐监盒盔盖盗盘盛盟盥盦目盯盱盲直盷相盹盼盾' + '省眄眇眈眉眊看眍眙眚真眠眢眦眨眩眬眭眯眵眶眷眸眺眼着睁睃睄睇睎睐睑睚睛睡睢督睥睦' + '睨睫睬睹睽睾睿瞀瞄瞅瞋瞌瞍瞎瞑瞒瞟瞠瞢瞥瞧瞩瞪瞫瞬瞭瞰瞳瞵瞻瞽瞿矍矗矛矜矞矢矣知' + '矧矩矫矬短矮矰石矶矸矻矼矾矿砀码砂砄砆砉砌砍砑砒研砖砗砘砚砜砝砟砠砣砥砧砫砬砭砮' + '砰破砵砷砸砹砺砻砼砾础硁硅硇硊硌硍硎硐硒硔硕硖硗硙硚硝硪硫硬硭确硼硿碃碇碈碉碌碍' + '碎碏碑碓碗碘碚碛碜碟碡碣碥碧碨碰碱碲碳碴碶碹碾磁磅磉磊磋磏磐磔磕磙磜磡磨磬磲磴磷' + '磹磻礁礅礌礓礞礴礵示礼社祀祁祃祆祇祈祉祊祋祎祏祐祓祕祖祗祚祛祜祝神祟祠祢祥祧票祭' + '祯祲祷祸祺祼祾禀禁禄禅禊禋福禒禔禘禚禛禤禧禳禹禺离禽禾秀私秃秆秉秋种科秒秕秘租秣' + '秤秦秧秩秫秬秭积称秸移秽秾稀稂稃稆程稌稍税稑稔稗稙稚稞稠稣稳稷稹稻稼稽稿穄穆穑穗' + '穙穜穟穰穴究穷穸穹空穿窀突窃窄窅窈窊窍窎窑窒窕窖窗窘窜窝窟窠窣窥窦窨窬窭窳窸窿立' + '竑竖竘站竞竟章竣童竦竫竭端竹竺竽竿笃笄笆笈笊笋笏笑笔笕笙笛笞笠笤笥符笨笪笫第笮笯' + '笱笳笸笺笼笾筀筅筇等筋筌筏筐筑筒答策筘筚筛筜筝筠筢筤筥筦筮筱筲筵筶筷筹筻筼签简箅' + '箍箐箓箔箕箖算箜管箢箦箧箨箩箪箫箬箭箱箴箸篁篆篇篌篑篓篙篚篝篡篥篦篪篮篯篱篷篼篾' + '簃簇簉簋簌簏簕簖簝簟簠簧簪簰簸簿籀籁籍籥米籴类籼籽粉粑粒粕粗粘粜粝粞粟粢粤粥粪粮' + '粱粲粳粹粼粽精粿糁糅糇糈糊糌糍糒糕糖糗糙糜糟糠糨糯糵系紊素索紧紫累絜絮絷綦綮縠縢' + '縻繁繄繇纂纛纠纡红纣纤纥约级纨纩纪纫纬纭纮纯纰纱纲纳纴纵纶纷纸纹纺纻纼纽纾线绀绁' + '绂练组绅细织终绉绊绋绌绍绎经绐绑绒结绔绕绖绗绘给绚绛络绝绞统绠绡绢绣绤绥绦继绨绩' + '绪绫续绮绯绰绱绲绳维绵绶绷绸绹绺绻综绽绾绿缀缁缂缃缄缅缆缇缈缉缊缌缎缐缑缒缓缔缕' + '编缗缘缙缚缛缜缝缞缟缠缡缢缣缤缥缦缧缨缩缪缫缬缭缮缯缰缱缲缳缴缵缶缸缺罂罄罅罍罐' + '网罔罕罗罘罚罟罡罢罨罩罪置罱署罴罶罹罽罾羁羊羌美羑羓羔羕羖羚羝羞羟羡群羧羯羰羱羲' + '羸羹羼羽羿翀翁翂翃翅翈翊翌翎翔翕翘翙翚翛翟翠翡翥翦翩翮翯翰翱翳翷翻翼翾耀老考耄者' + '耆耇耋而耍耏耐耑耒耔耕耖耗耘耙耜耠耢耤耥耦耧耨耩耪耰耱耳耵耶耷耸耻耽耿聂聃聆聊聋' + '职聍聒联聘聚聩聪聱聿肃肄肆肇肉肋肌肓肖肘肚肛肝肟肠股肢肤肥肩肪肫肭肮肯肱育肴肷肸' + '肺肼肽肾肿胀胁胂胃胄胆胈背胍胎胖胗胙胚胛胜胝胞胠胡胣胤胥胧胨胩胪胫胬胭胯胰胱胲胳' + '胴胶胸胺胼能脂脆脉脊脍脎脏脐脑脒脓脔脖脘脚脞脟脩脬脯脱脲脶脸脾脿腆腈腊腋腌腐腑腒' + '腓腔腕腘腙腚腠腥腧腨腩腭腮腯腰腱腴腹腺腻腼腽腾腿膀膂膈膊膏膑膘膙膛膜膝膦膨膳膺膻' + '臀臂臃臆臊臌臑臜臣臧自臬臭至致臻臼臾舀舁舂舄舅舆舌舍舐舒舔舛舜舞舟舠舢舣舥航舫般' + '舭舯舰舱舲舳舴舵舶舷舸船舻舾艄艅艇艉艋艎艏艘艚艟艨艮良艰色艳艴艺艽艾艿节芃芄芈芊' + '芋芍芎芏芑芒芗芘芙芜芝芟芠芡芣芤芥芦芨芩芪芫芬芭芮芯芰花芳芴芷芸芹芼芽芾苁苄苇苈' + '苉苊苋苌苍苎苏苑苒苓苔苕苗苘苛苜苞苟苠苡苣苤若苦苧苫苯英苴苷苹苻苾茀茁茂范茄茅茆' + '茈茉茋茌茎茏茑茓茔茕茗茚茛茜茝茧茨茫茬茭茯茱茳茴茵茶茸茹茺茼茽荀荁荃荄荆荇草荏荐' + '荑荒荓荔荖荙荚荛荜荞荟荠荡荣荤荥荦荧荨荩荪荫荬荭荮药荷荸荻荼荽莅莆莉莎莒莓莘莙莛' + '莜莝莞莠莨莩莪莫莰莱莲莳莴莶获莸莹莺莼莽莿菀菁菂菅菇菉菊菌菍菏菔菖菘菜菝菟菠菡菥' + '菩菪菰菱菲菹菼菽萁萃萄萆萋萌萍萎萏萑萘萚萜萝萣萤营萦萧萨萩萱萳萸萹萼落葆葎葑葖著' + '葙葚葛葜葡董葩葫葬葭葰葱葳葴葵葶葸葺蒂蒄蒇蒈蒉蒋蒌蒎蒐蒗蒙蒜蒟蒡蒨蒯蒱蒲蒴蒸蒹蒺' + '蒻蒽蒿蓁蓂蓄蓇蓉蓊蓍蓏蓐蓑蓓蓖蓝蓟蓠蓢蓣蓥蓦蓬蓰蓼蓿蔀蔃蔈蔊蔌蔑蔓蔗蔚蔟蔡蔫蔬蔷' + '蔸蔹蔺蔻蔼蔽蕃蕈蕉蕊蕖蕗蕙蕞蕤蕨蕰蕲蕴蕹蕺蕻蕾薁薄薅薇薏薛薜薢薤薨薪薮薯薰薳薷薸' + '薹薿藁藉藏藐藓藕藜藟藠藤藦藨藩藻藿蘅蘑蘖蘘蘧蘩蘸蘼虎虏虐虑虒虓虔虚虞虢虤虫虬虮虱' + '虷虸虹虺虻虼虽虾虿蚀蚁蚂蚄蚆蚊蚋蚌蚍蚓蚕蚜蚝蚣蚤蚧蚨蚩蚪蚬蚯蚰蚱蚲蚴蚶蚺蛀蛃蛄蛆' + '蛇蛉蛊蛋蛎蛏蛐蛑蛔蛘蛙蛛蛞蛟蛤蛩蛭蛮蛰蛱蛲蛳蛴蛸蛹蛾蜀蜂蜃蜇蜈蜉蜊蜍蜎蜐蜒蜓蜕蜗' + '蜘蜚蜜蜞蜡蜢蜣蜥蜩蜮蜱蜴蜷蜻蜾蜿蝇蝈蝉蝌蝎蝓蝗蝘蝙蝠蝣蝤蝥蝮蝰蝲蝴蝶蝻蝼蝽蝾螂螃' + '螅螈螋融螗螟螠螣螨螫螬螭螯螱螳螵螺螽蟀蟆蟊蟋蟏蟑蟒蟛蟠蟥蟪蟫蟮蟹蟾蠃蠊蠋蠓蠕蠖蠡' + '蠢蠲蠹蠼血衃衄衅行衍衎衒衔街衙衠衡衢衣补表衩衫衬衮衰衲衷衽衾衿袁袂袄袅袆袈袋袍袒' + '袖袗袜袢袤袪被袭袯袱袷袼裁裂装裆裈裉裎裒裔裕裘裙裛裟裢裣裤裥裨裰裱裳裴裸裹裼裾褂' + '褊褐褒褓褕褙褚褛褟褡褥褪褫褯褰褴褶襁襄襕襚襜襞襟襦襫襻西要覃覆见观觃规觅视觇览觉' + 
'觊觋觌觎觏觐觑角觖觚觜觞觟解觥触觫觭觯觱觳觿言訄訇訚訾詈詟詹誉誊誓謇警譬计订讣认' + '讥讦讧讨让讪讫训议讯记讱讲讳讴讵讶讷许讹论讻讼讽设访诀证诂诃评诅识诇诈诉诊诋诌词' + '诎诏诐译诒诓诔试诖诗诘诙诚诛诜话诞诟诠诡询诣诤该详诧诨诩诫诬语诮误诰诱诲诳说诵请' + '诸诹诺读诼诽课诿谀谁谂调谄谅谆谇谈谊谋谌谍谎谏谐谑谒谓谔谕谖谗谙谚谛谜谝谞谟谠谡' + '谢谣谤谥谦谧谨谩谪谫谬谭谮谯谰谱谲谳谴谵谶谷谼谿豁豆豇豉豌豕豚象豢豨豪豫豮豳豸豹' + '豺貂貅貆貉貊貌貔貘贝贞负贡财责贤败账货质贩贪贫贬购贮贯贰贱贲贳贴贵贶贷贸费贺贻贼' + '贽贾贿赀赁赂赃资赅赆赇赈赉赊赋赌赍赎赏赐赑赒赓赔赕赖赗赘赙赚赛赜赝赞赟赠赡赢赣赤' + '赦赧赪赫赭走赳赴赵赶起趁趄超越趋趑趔趟趣趯趱足趴趵趸趺趼趾趿跂跃跄跆跋跌跎跏跐跑' + '跖跗跚跛距跞跟跣跤跨跪跬路跱跳践跶跷跸跹跺跻跽踅踉踊踌踏踒踔踝踞踟踢踣踦踩踪踬踮' + '踯踱踵踶踹踺踽蹀蹁蹂蹄蹅蹇蹈蹉蹊蹋蹐蹑蹒蹙蹚蹜蹢蹦蹩蹬蹭蹯蹰蹲蹴蹶蹼蹽蹾蹿躁躅躇' + '躏躐躔躜躞身躬躯躲躺车轧轨轩轪轫转轭轮软轰轱轲轳轴轵轶轷轸轹轺轻轼载轾轿辀辁辂较' + '辄辅辆辇辈辉辊辋辌辍辎辏辐辑辒输辔辕辖辗辘辙辚辛辜辞辟辣辨辩辫辰辱边辽达辿迁迂迄' + '迅过迈迎运近迓返迕还这进远违连迟迢迤迥迦迨迩迪迫迭迮述迳迷迸迹迺追退送适逃逄逅逆' + '选逊逋逍透逐逑递途逖逗通逛逝逞速造逡逢逦逭逮逯逴逵逶逸逻逼逾遁遂遄遆遇遍遏遐遑遒' + '道遗遘遛遢遣遥遨遭遮遴遵遹遽避邀邂邃邈邋邑邓邕邗邘邙邛邝邠邡邢那邦邨邪邬邮邯邰邱' + '邲邳邴邵邶邸邹邺邻邽邾邿郁郃郄郅郇郈郊郎郏郐郑郓郗郚郛郜郝郡郢郤郦郧部郪郫郭郯郴' + '郸都郾郿鄀鄂鄃鄄鄅鄌鄑鄗鄘鄙鄚鄜鄞鄠鄢鄣鄫鄯鄱鄹酂酃酅酆酉酊酋酌配酎酏酐酒酗酚酝' + '酞酡酢酣酤酥酦酩酪酬酮酯酰酱酲酴酵酶酷酸酹酺酽酾酿醅醇醉醋醌醍醐醑醒醚醛醢醨醪醭' + '醮醯醴醵醺醾采釉释里重野量釐金釜鉴銎銮鋆鋈錾鍪鎏鏊鏖鐾鑫钆钇针钉钊钋钌钍钎钏钐钒' + '钓钔钕钖钗钘钙钚钛钜钝钞钟钠钡钢钣钤钥钦钧钨钩钪钫钬钭钮钯钰钱钲钳钴钵钷钹钺钻钼' + '钽钾钿铀铁铂铃铄铅铆铈铉铊铋铌铍铎铏铐铑铒铕铖铗铘铙铚铛铜铝铞铟铠铡铢铣铤铥铧铨' + '铩铪铫铬铭铮铯铰铱铲铳铴铵银铷铸铹铺铻铼铽链铿销锁锂锃锄锅锆锇锈锉锊锋锌锍锎锏锐' + '锑锒锓锔锕锖锗锘错锚锛锜锝锞锟锡锢锣锤锥锦锧锨锩锪锫锬锭键锯锰锱锲锳锴锵锶锷锸锹' + '锺锻锼锽锾锿镀镁镂镃镄镅镆镇镈镉镊镋镌镍镎镏镐镑镒镓镔镕镖镗镘镚镛镜镝镞镠镡镢镣' + '镤镥镦镧镨镩镪镫镬镭镮镯镰镱镲镳镴镵镶长门闩闪闫闭问闯闰闱闲闳间闵闶闷闸闹闺闻闼' + '闽闾闿阀阁阂阃阄阅阆阇阈阉阊阋阌阍阎阏阐阑阒阔阕阖阗阘阙阚阜队阡阪阮阱防阳阴阵阶' + '阻阼阽阿陀陂附际陆陇陈陉陋陌降陎限陑陔陕陛陞陟陡院除陧陨险陪陬陲陴陵陶陷隃隅隆隈' + '隋隍随隐隔隗隘隙障隧隩隰隳隶隹隺隼隽难雀雁雄雅集雇雉雊雌雍雎雏雒雕雠雨雩雪雯雱雳' + '零雷雹雾需霁霄霅霆震霈霉霍霎霏霓霖霜霞霨霪霭霰露霸霹霾青靓靖静靛非靠靡面靥革靬靰' + '靳靴靶靸靺靼靽靿鞁鞅鞋鞍鞑鞒鞔鞘鞠鞡鞣鞧鞨鞫鞬鞭鞮鞯鞲鞳鞴韂韦韧韨韩韪韫韬韭音韵' + '韶页顶顷顸项顺须顼顽顾顿颀颁颂颃预颅领颇颈颉颊颋颌颍颎颏颐频颓颔颖颗题颙颚颛颜额' + '颞颟颠颡颢颤颥颦颧风飏飐飑飒飓飔飕飗飘飙飞食飧飨餍餐餮饔饕饥饧饨饩饪饫饬饭饮饯饰' + '饱饲饳饴饵饶饷饸饹饺饻饼饽饿馁馃馄馅馆馇馈馉馊馋馌馍馏馐馑馒馓馔馕首馗馘香馝馞馥' + '馧馨马驭驮驯驰驱驲驳驴驵驶驷驸驹驺驻驼驽驾驿骀骁骂骃骄骅骆骇骈骉骊骋验骍骎骏骐骑' + '骒骓骕骖骗骘骙骚骛骜骝骞骟骠骡骢骣骤骥骦骧骨骰骱骶骷骸骺骼髀髁髂髃髅髋髌髎髑髓高' + '髡髢髦髫髭髯髹髻髽鬃鬈鬏鬒鬓鬘鬟鬣鬯鬲鬶鬷鬻鬼魁魂魃魄魅魆魇魈魉魋魍魏魑魔鱼鱽鱾' + '鱿鲀鲁鲂鲃鲅鲆鲇鲈鲉鲊鲋鲌鲍鲎鲏鲐鲑鲒鲔鲕鲖鲗鲘鲙鲚鲛鲜鲝鲞鲟鲠鲡鲢鲣鲤鲥鲦鲧鲨' + '鲩鲪鲫鲬鲭鲮鲯鲰鲱鲲鲳鲴鲵鲷鲸鲹鲺鲻鲼鲽鲾鲿鳀鳁鳂鳃鳄鳅鳇鳈鳉鳊鳌鳍鳎鳏鳐鳑鳒鳓' + '鳔鳕鳖鳗鳘鳙鳚鳛鳜鳝鳞鳟鳠鳡鳢鳣鳤鸟鸠鸡鸢鸣鸤鸥鸦鸧鸨鸩鸪鸫鸬鸭鸮鸯鸰鸱鸲鸳鸵鸶' + '鸷鸸鸹鸺鸻鸼鸽鸾鸿鹀鹁鹂鹃鹄鹅鹆鹇鹈鹉鹊鹋鹌鹍鹎鹏鹐鹑鹒鹔鹕鹖鹗鹘鹙鹚鹛鹜鹝鹞鹟' + '鹠鹡鹢鹣鹤鹦鹧鹨鹩鹪鹫鹬鹭鹮鹯鹰鹱鹲鹳鹴鹾鹿麀麂麇麈麋麑麒麓麖麝麟麦麸麹麻麽麾黄' + '黇黉黍黎黏黑黔默黛黜黝黟黠黡黢黥黧黩黪黯黹黻黼黾鼋鼍鼎鼐鼒鼓鼗鼙鼠鼢鼩鼫鼬鼯鼱鼷' + '鼹鼻鼽鼾齁齇齉齐齑齿龀龁龂龃龄龅龆龇龈龉龊龋龌龙龚龛龟龠龢鿍鿎鿏㑇㑊㕮㘎㙍㙘㙦㛃' + '㛚㛹㟃㠇㠓㤘㥄㧐㧑㧟㫰㬊㬎㬚㭎㭕㮾㰀㳇㳘㳚㴔㵐㶲㸆㸌㺄㻬㽏㿠䁖䂮䃅䃎䅟䌹䎃䎖䏝䏡' + '䏲䐃䓖䓛䓨䓫䓬䗖䗛䗪䗴䜣䝙䢺䢼䣘䥽䦃䲟䲠䲢䴓䴔䴕䴖䴗䴘䴙䶮𠅤𠙶𠳐𡎚𡐓𣗋𣲗𣲘𣸣𤧛𤩽' + '𤫉𥔲𥕢𥖨𥻗𦈡𦒍𦙶𦝼𦭜𦰡𧿹𨐈𨙸𨚕𨟠𨭉𨱇𨱏𨱑𨱔𨺙𩽾𩾃𩾌𪟝𪣻𪤗𪨰𪨶𪩘𪾢𫄧𫄨𫄷𫄸𫇭𫌀𫍣𫍯' + '𫍲𫍽𫐄𫐐𫐓𫑡𫓧𫓯𫓶𫓹𫔍𫔎𫔶𫖮𫖯𫖳𫗧𫗴𫘜𫘝𫘦𫘧𫘨𫘪𫘬𫚕𫚖𫚭𫛭𫞩𫟅𫟦𫟹𫟼𫠆𫠊𫠜𫢸𫫇𫭟' + '𫭢𫭼𫮃𫰛𫵷𫶇𫷷𫸩𬀩𬀪𬂩𬃊𬇕𬇙𬇹𬉼𬊈𬊤𬌗𬍛𬍡𬍤𬒈𬒔𬒗𬕂𬘓𬘘𬘡𬘩𬘫𬘬𬘭𬘯𬙂𬙊𬙋𬜬𬜯𬞟' + '𬟁𬟽𬣙𬣞𬣡𬣳𬤇𬤊𬤝𬨂𬨎𬩽𬪩𬬩𬬭𬬮𬬱𬬸𬬹𬬻𬬿𬭁𬭊𬭎𬭚𬭛𬭤𬭩𬭬𬭯𬭳𬭶𬭸𬭼𬮱𬮿𬯀𬯎𬱖𬱟' + '𬳵𬳶𬳽𬳿𬴂𬴃𬴊𬶋𬶍𬶏𬶐𬶟𬶠𬶨𬶭𬶮𬷕𬸘𬸚𬸣𬸦𬸪𬹼𬺈𬺓') + +VALID_CHARS = CN_CHARS + EN_CHARS + DIGIT_CHARS + ' ' +VALID_CHARS_MAP = {c: True for c in VALID_CHARS} # ================================================================================ # # basic class # ================================================================================ # -class ChineseChar(): +class ChineseChar(object): """ 中文字符 每个字符对应简体和繁体, @@ -67,6 +391,7 @@ class ChineseChar(): def __init__(self, simplified, traditional): self.simplified = simplified self.traditional = traditional + #self.__repr__ = self.__str__ def __str__(self): return self.simplified or self.traditional or None @@ -83,7 +408,7 @@ class ChineseNumberUnit(ChineseChar): """ def __init__(self, power, simplified, traditional, big_s, big_t): - super().__init__(simplified, traditional) + super(ChineseNumberUnit, self).__init__(simplified, traditional) self.power = power self.big_s = big_s self.big_t = big_t @@ -144,7 +469,7 @@ class ChineseNumberDigit(ChineseChar): big_t, alt_s=None, alt_t=None): - super().__init__(simplified, traditional) + super(ChineseNumberDigit, self).__init__(simplified, traditional) self.value = value self.big_s = big_s self.big_t = big_t @@ -165,7 +490,7 @@ class ChineseMath(ChineseChar): """ def __init__(self, simplified, traditional, symbol, expression=None): - 
super().__init__(simplified, traditional) + super(ChineseMath, self).__init__(simplified, traditional) self.symbol = symbol self.expression = expression self.big_s = simplified @@ -175,14 +500,14 @@ class ChineseMath(ChineseChar): CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath -class NumberSystem(): +class NumberSystem(object): """ 中文数字系统 """ pass -class MathSymbol(): +class MathSymbol(object): """ 用于中文数字系统的数学符号 (繁/简体), e.g. positive = ['正', '正'] @@ -200,7 +525,7 @@ class MathSymbol(): yield v -# class OtherSymbol(): +# class OtherSymbol(object): # """ # 其他符号 # """ @@ -366,17 +691,20 @@ def num2chn(number_string, use_zeros=True, use_units=True): def get_value(value_string, use_zeros=True): + striped_string = value_string.lstrip('0') # record nothing if all zeros if not striped_string: return [] + # record one digits elif len(striped_string) == 1: if use_zeros and len(value_string) != len(striped_string): return [system.digits[0], system.digits[int(striped_string)]] else: return [system.digits[int(striped_string)]] + # recursively record multiple digits else: result_unit = next( @@ -403,7 +731,6 @@ def num2chn(number_string, result_symbols = get_value(int_string) else: result_symbols = [system.digits[int(c)] for c in int_string] - dec_symbols = [system.digits[int(c)] for c in dec_string] if dec_string: result_symbols += [system.math.point] + dec_symbols @@ -418,13 +745,12 @@ def num2chn(number_string, previous_symbol = result_symbols[i - 1] if i > 0 else None if isinstance(next_symbol, CNU) and isinstance( previous_symbol, (CNU, type(None))): - # yapf: disable - if next_symbol.power != 1 and ((previous_symbol is None) or - (previous_symbol.power != 1)): + if next_symbol.power != 1 and ( + (previous_symbol is None) or + (previous_symbol.power != 1)): result_symbols[i] = liang - # yapf: enable - # if big is True, '两' will not be used and `alt_two` has no impact on output + # if big is True, '两' will not be used and `alt_two` has no impact on output if big: attr_name = 'big_' if traditional: @@ -516,6 +842,7 @@ class TelePhone: # return self.telephone def telephone2chntext(self, fixed=False): + if fixed: sil_parts = self.telephone.split('-') self.raw_chntext = ''.join([ @@ -592,7 +919,6 @@ class Date: except ValueError: other = date year = '' - if other: try: month, day = other.strip().split('月', 1) @@ -600,13 +926,11 @@ class Date: except ValueError: day = date month = '' - if day: day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1] else: month = '' day = '' - chntext = year + month + day self.chntext = chntext return self.chntext @@ -782,6 +1106,52 @@ class NSWNormalizer: return self.norm_text.lstrip('^').rstrip('$') +# ================================================================================ # +# misc normalization functions +# ================================================================================ # +def remove_erhua(text): + """ + 去除儿化音词中的儿: + 他女儿在那边儿 -> 他女儿在那边 + """ + + new_str = '' + while re.search('儿', text): + a = re.search('儿', text).span() + remove_er_flag = 0 + + if ER_WHITELIST_PATTERN.search(text): + b = ER_WHITELIST_PATTERN.search(text).span() + if b[0] <= a[0]: + remove_er_flag = 1 + + if remove_er_flag == 0: + new_str = new_str + text[0:a[0]] + text = text[a[1]:] + else: + new_str = new_str + text[0:b[1]] + text = text[b[1]:] + + text = new_str + text + return text + + +def check_chars(text): + for c in text: + if not VALID_CHARS_MAP.get(c): + return c + return '' + + +def quanjiao2banjiao(text): + return 
text.translate(QJ2BJ_transform) + + +# ================================================================================ # +# testing +# ================================================================================ # + + def nsw_test_case(raw_text): print('I:' + raw_text) print('O:' + NSWNormalizer(raw_text).normalize()) @@ -806,89 +1176,234 @@ def nsw_test(): nsw_test_case('有62%的概率') +###################################################################################### + + +## Normalize unicode characters +def remove_weird_chars(text): + # ``` + # (NFKD) will apply the compatibility decomposition, i.e. + # replace all compatibility characters with their equivalents. + # ``` + text = unicodedata.normalize('NFKD', text).encode('utf-8', 'ignore').decode( + 'utf-8', 'ignore') + return text + + +## Remove extra linebreaks +def remove_extra_linebreaks(text): + lines = text.split(r'\n+') + return '\n'.join( + [re.sub(r'[\s]+', ' ', l).strip() for l in lines if len(l) != 0]) + + +## Remove extra medial/trailing/leading spaces +def remove_extra_spaces(text): + return re.sub("\\s+", " ", text).strip() + + +## Seg the text into words +def seg(text): + text_seg = jieba.cut(text) + out = ' '.join(text_seg) + return out + + +## Remove punctuation/symbols +def remove_symbols(text): + """ + + Unicode 6.0 has 7 character categories, and each category has subcategories: + + Letter (L): lowercase (Ll), modifier (Lm), titlecase (Lt), uppercase (Lu), other (Lo) + Mark (M): spacing combining (Mc), enclosing (Me), non-spacing (Mn) + Number (N): decimal digit (Nd), letter (Nl), other (No) + Punctuation (P): connector (Pc), dash (Pd), initial quote (Pi), final quote (Pf), open (Ps), close (Pe), other (Po) + Symbol (S): currency (Sc), modifier (Sk), math (Sm), other (So) + Separator (Z): line (Zl), paragraph (Zp), space (Zs) + Other (C): control (Cc), format (Cf), not assigned (Cn), private use (Co), surrogate (Cs) + + + There are 3 ranges reserved for private use (Co subcategory): + U+E000—U+F8FF (6,400 code points), U+F0000—U+FFFFD (65,534) and U+100000—U+10FFFD (65,534). + Surrogates (Cs subcategory) use the range U+D800—U+DFFF (2,048 code points). + + + """ + ## Brute-force version: list all possible unicode ranges, but this list is not complete. 
+ # text = re.sub('[\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e\u00a1-\u00bf\u2000-\u206f\u2013-\u204a\u20a0-\u20bf\u2100-\u214f\u2150-\u218b\u2190-\u21ff\u2200-\u22ff\u2300-\u23ff\u2460-\u24ff\u2500-\u257f\u2580-\u259f\u25a0-\u25ff\u2600-\u26ff\u2e00-\u2e7f\u3000-\u303f\ufe50-\ufe6f\ufe30-\ufe4f\ufe10-\ufe1f\uff00-\uffef─◆╱]+','',text) + + text = ''.join( + ch for ch in text if unicodedata.category(ch)[0] not in ['P', 'S']) + return text + + +## Remove numbers +def remove_numbers(text): + return re.sub('\\d+', "", text) + + +## Remove alphabets +def remove_alphabets(text): + return re.sub('[a-zA-Z]+', '', text) + + +## Combine every step +def normalize_corpus(corpus, + is_remove_extra_linebreaks=True, + is_remove_weird_chars=True, + is_seg=True, + is_remove_symbols=True, + is_remove_numbers=True, + is_remove_alphabets=True): + + normalized_corpus = [] + # normalize each document in the corpus + for doc in corpus: + + if is_remove_extra_linebreaks: + doc = remove_extra_linebreaks(doc) + + if is_remove_weird_chars: + doc = remove_weird_chars(doc) + + if is_seg: + doc = seg(doc) + + if is_remove_symbols: + doc = remove_symbols(doc) + + if is_remove_alphabets: + doc = remove_alphabets(doc) + + if is_remove_numbers: + doc = remove_numbers(doc) + + normalized_corpus.append(remove_extra_spaces(doc)) + + return normalized_corpus + + +###################################################################################### + + def char_token(s: Text) -> List[Text]: """chinese charactor - Args: - s (Text): [description] + s (Text): "我爱中国“ Returns: - List[Text]: [description] + List[Text]: ['我', '爱', '中', '国'] """ return list(s) def word_token(s: Text) -> List[Text]: """chinese word - Args: - s (Text): [description] + s (Text): "我爱中国“ Returns: - List[Text]: [description] + List[Text]: ['我', '爱', '中国'] """ return jieba.lcut(s) -def text_process(s: Text) -> Text: +def find_chinese(file): + pattern = re.compile(r'[^\u4e00-\u9fa5]') + chinese = re.sub(pattern, '', file) + return chinese + + +def text_process(text: Text, args) -> Text: """do chinese text normaliztion + 1. remove * + 2. NWS + 3. remove puncuation + 4. 
remove english Args: - s (Text): [description] + text (Text): [description] Returns: Text: [description] """ - s = s.replace('*', '') + # strip + text = text.strip() + text = remove_extra_linebreaks(text) + text = remove_weird_chars(text) + text = remove_extra_spaces(text) + + # quanjiao -> banjiao + if args.to_banjiao: + text = quanjiao2banjiao(text) + + # Unify upper/lower cases + if args.to_upper: + text = text.upper() + if args.to_lower: + text = text.lower() + + # Remove filler chars + if args.remove_fillers: + for c in FILLER_CHARS: + text = text.replace(c, '') + + if args.remove_erhua: + text = remove_erhua(text) + + text = text.replace('*', '') + # NSW(Non-Standard-Word) normalization - s = NSWNormalizer(s).normalize() + text = NSWNormalizer(text).normalize() + if len(text) == 0: + exit(-1) + # Punctuations removal - s = re.sub(f'[{hanzi.punctuation}{string.punctuation}]', "", s) + text = re.sub(f'[{hanzi.punctuation}{string.punctuation}]', "", text) + + # Remove punctuations + old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations + new_chars = ' ' * len(old_chars) + del_chars = '' + text = text.translate(str.maketrans(old_chars, new_chars, del_chars)) + # rm english - s = ''.join(re.findall(hanzi.sent, s)) - return s + text = find_chinese(text) + + # Remove space + if args.remove_space: + text = text.replace(' ', '') + + return text def main(infile, outfile, args): # tokenizer token_type = args.token_type - if token_type == 'char': + if token_type.lower() == 'char': tokenizer = char_token - elif token_type == 'word': + elif token_type.lower() == 'word': tokenizer = word_token else: tokenizer = None with open(infile, 'rt') as fin, open(outfile, 'wt') as fout: - lines = fin.readlines() - n = 0 - for l in lines: - key = '' - text = '' + ndone = 0 + for line in fin: + line = line.strip() + key, text = '', '' if args.has_key: - cols = l.split(maxsplit=1) + cols = line.split(maxsplit=1) key = cols[0] - if len(cols) == 2: - text = cols[1] - else: - text = '' + text = cols[1] if len(cols) == 2 else '' else: - text = l - - # strip - text = text.strip() - # cases - if args.to_upper and args.to_lower: - sys.stderr.write('to_upper OR to_lower?') - exit(1) - if args.to_upper: - text = text.upper() - if args.to_lower: - text = text.lower() - - # Normalization - text = text_process(text) + text = line + + text = text_process(text, args) + + # word segment: chinese char/word if tokenizer: text = ' '.join(tokenizer(text)) @@ -899,29 +1414,56 @@ def main(infile, outfile, args): ) != '': # skip empty line in pure text format(without Kaldi's utt key) fout.write(text + '\n') - n += 1 - if n % args.log_interval == 0: - print(f"process {n} lines.", file=sys.stderr) + ndone += 1 + if ndone % args.log_interval == 0: + print( + f'text norm: {ndone} lines done.', + file=sys.stderr, + flush=True) + + print( + f'text norm: {ndone} lines done in total.', + file=sys.stderr, + flush=True) if __name__ == '__main__': p = argparse.ArgumentParser() - p.add_argument('token_type', default=None, help='token type. [char|word]') - p.add_argument('ifile', help='input filename, assume utf-8 encoding') - p.add_argument('ofile', help='output filename') - p.add_argument( - '--to_upper', action='store_true', help='convert to upper case') - p.add_argument( - '--to_lower', action='store_true', help='convert to lower case') + p.add_argument('--token_type', default=None, help='token type. 
[char|word]') p.add_argument( '--has_key', - action='store_true', + default=False, help="input text has Kaldi's key as first field.") p.add_argument( '--log_interval', type=int, - default=100000, + default=10000, help='log interval in number of processed lines') - args = p.parse_args() + p.add_argument( + '--to_banjiao', + action='store_true', + help='convert quanjiao chars to banjiao') + p.add_argument( + '--to_upper', action='store_true', help='convert to upper case') + p.add_argument( + '--to_lower', action='store_true', help='convert to lower case') + p.add_argument( + '--remove_fillers', + action='store_true', + help='remove filler chars such as "呃, 啊"') + p.add_argument( + '--remove_erhua', + action='store_true', + help='remove erhua chars such as "他女儿在那边儿 -> 他女儿在那边"') + p.add_argument( + '--check_chars', + action='store_true', + help='skip sentences containing illegal chars') + p.add_argument( + '--remove_space', action='store_true', help='remove whitespace') + p.add_argument('ifile', help='input filename, assume utf-8 encoding') + p.add_argument('ofile', help='output filename') + args = p.parse_args() + print(args) main(args.ifile, args.ofile, args) From f399ca9d323ef0ba99ef1bd1e955e11975003a00 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Sat, 16 Apr 2022 13:36:24 +0000 Subject: [PATCH 10/18] format --- utils/compute-wer.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/utils/compute-wer.py b/utils/compute-wer.py index 978a80c9..2d7cc8e1 100755 --- a/utils/compute-wer.py +++ b/utils/compute-wer.py @@ -2,7 +2,6 @@ # -*- coding: utf-8 -*- # CopyRight WeNet Apache-2.0 License import codecs -import re import sys import unicodedata @@ -33,7 +32,8 @@ def characterize(string): else: # some input looks like: , we want to separate it to two words. 
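+            # (illustrative note: a token that opens with '<' is taken to run
+            #  through the matching '>', so an input like <tag>好 is split into
+            #  ['<tag>', '好'] instead of being cut character by character;
+            #  '<tag>' here is a hypothetical placeholder)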
            sep = ' '
-            if char == '<': sep = '>'
+            if char == '<':
+                sep = '>'
             j = i + 1
             while j < len(string):
                 c = string[j]
@@ -48,7 +48,8 @@ def characterize(string):
 
 
 def stripoff_tags(x):
-    if not x: return ''
+    if not x:
+        return ''
     chars = []
     i = 0
     T = len(x)
@@ -365,7 +366,7 @@ if __name__ == '__main__':
             verbose = 0
             try:
                 verbose = int(b)
-            except:
+            except Exception:
                 if b == 'true' or b != '0':
                     verbose = 1
             continue
@@ -408,7 +409,8 @@ if __name__ == '__main__':
             array = characterize(line)
         else:
             array = line.strip().split()
-        if len(array) == 0: continue
+        if len(array) == 0:
+            continue
         fid = array[0]
         rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive, split)
@@ -419,7 +421,8 @@ if __name__ == '__main__':
             array = characterize(line)
         else:
             array = line.rstrip('\n').split()
-        if len(array) == 0: continue
+        if len(array) == 0:
+            continue
         fid = array[0]
         if fid not in rec_set:
             continue
@@ -526,7 +529,7 @@ if __name__ == '__main__':
         for line in open(cluster_file, 'r', encoding='utf-8'):
             for token in line.decode('utf-8').rstrip('\n').split():
                 # end of cluster reached, like </Keyword>
-                if token[0:2] == '</' and \
+                if token[0:2] == '</' and \
                     token.lstrip('</').rstrip('>') == cluster_id :
                     result = calculator.cluster(cluster)
                     if result['all'] != 0:
@@ -541,7 +544,7 @@ if __name__ == '__main__':
                     cluster_id = ''
                     cluster = []
                 # begin of cluster reached, like <Keyword>
-                elif token[0] == '<' and token[len(token)-1] == '>' and \
+                elif token[0] == '<' and token[len(token) - 1] == '>' and \
                     cluster_id == '' :
                     cluster_id = token.lstrip('<').rstrip('>')
                     cluster = []

From c492a42f140ee919b418f3958a1a3213613c12c5 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sat, 16 Apr 2022 14:52:34 +0000
Subject: [PATCH 11/18] add build tlg wfst

---
 speechx/examples/README.md                    |  12 +-
 .../ngram/zh/local/text_to_lexicon.py         |  37 ++++
 speechx/examples/ngram/zh/run.sh              |  14 +-
 speechx/examples/wfst/.gitignore              |   1 +
 speechx/examples/wfst/README.md               | 168 ++++++++++++++++++
 speechx/examples/wfst/path.sh                 |   2 +-
 speechx/examples/wfst/run.sh                  |  47 +----
 speechx/examples/wfst/utils                   |   1 +
 8 files changed, 234 insertions(+), 48 deletions(-)
 create mode 100755 speechx/examples/ngram/zh/local/text_to_lexicon.py
 create mode 100644 speechx/examples/wfst/.gitignore
 mode change 100644 => 100755 speechx/examples/wfst/run.sh
 create mode 120000 speechx/examples/wfst/utils

diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index c3de0d3a..50f5f902 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -22,12 +22,16 @@ pip install netron
 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
 ```
+## For Developer
+
+> Warning: For developers only; make sure you know what it does.
+
+* dev - for speechx developers, used for testing.
+
 ## Build WFST
+> Warning: Use the examples below only when you know what they do.
+
 * text_lm - process text for build lm
 * ngram - using to build NGram ARPA lm.
 * wfst - build wfst for TLG.
-
-## For Developer
-
-* dev - for speechx developer, using for test.
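The compute-wer.py changes earlier in this patch are purely stylistic (PEP8 line breaks); the metric the tool computes is the classic token-level edit distance. A minimal sketch of that recurrence, for orientation only; it is not the tool's actual implementation, which additionally tracks alignments, case sensitivity and tag clusters:

```
# Minimal Levenshtein/WER sketch (illustration only, not compute-wer.py itself).
def edit_distance(ref, hyp):
    """Token-level Levenshtein distance between two token lists."""
    m, n = len(ref), len(hyp)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i                       # i deletions
    for j in range(n + 1):
        dp[0][j] = j                       # j insertions
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            sub = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(sub,                # substitution or match
                           dp[i - 1][j] + 1,   # deletion
                           dp[i][j - 1] + 1)   # insertion
    return dp[m][n]

# CER example: one substituted character over six reference characters.
ref, hyp = list("今天天气很好"), list("今天天汽很好")
print(edit_distance(ref, hyp) / len(ref))  # -> 0.1666...
```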
diff --git a/speechx/examples/ngram/zh/local/text_to_lexicon.py b/speechx/examples/ngram/zh/local/text_to_lexicon.py new file mode 100755 index 00000000..4d6b016d --- /dev/null +++ b/speechx/examples/ngram/zh/local/text_to_lexicon.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +import argparse + +def main(args): + with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout: + for line in fin: + line = line.strip() + if args.has_key: + utt, text = line.split(maxsplit=1) + words = text.split() + else: + words = line.split() + + for word in words: + val = " ".join(list(word)) + fout.write(f"{word}\t{val}\n") + fout.flush() + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).') + parser.add_argument( + '--has_key', + default=True, + help='text path, with utt or not') + parser.add_argument( + '--text', + required=True, + help='text path. line: utt1 中国 人 or 中国 人') + parser.add_argument( + '--lexicon', + required=True, + help='lexicon path. line:中国 中 国') + args = parser.parse_args() + print(args) + + main(args) diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh index eda422b3..347dfa2d 100755 --- a/speechx/examples/ngram/zh/run.sh +++ b/speechx/examples/ngram/zh/run.sh @@ -42,11 +42,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then # line: char/spm_pices cp $unit data/local/dict/units.txt - # line: word ph0 ... phn -> line: word char0 ... charn - utils/fst/prepare_dict.py \ - --unit_file $unit \ - --in_lexicon ${lexicon} \ - --out_lexicon data/local/dict/lexicon.txt + if [ -f $lexicon ];then + # line: word ph0 ... phn -> line: word char0 ... charn + utils/fst/prepare_dict.py \ + --unit_file $unit \ + --in_lexicon ${lexicon} \ + --out_lexicon data/local/dict/lexicon.txt + else + local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt + fi fi lm=data/local/lm diff --git a/speechx/examples/wfst/.gitignore b/speechx/examples/wfst/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/speechx/examples/wfst/.gitignore @@ -0,0 +1 @@ +data diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md index 4f862a25..4f4674a4 100644 --- a/speechx/examples/wfst/README.md +++ b/speechx/examples/wfst/README.md @@ -1,3 +1,146 @@ +# Built TLG wfst + +## Input +``` +data/local/ +├── dict +│ ├── lexicon.txt +│ └── units.txt +└── lm + ├── heldout + ├── lm.arpa + ├── text + ├── text.no_oov + ├── train + ├── unigram.counts + ├── word.counts + └── wordlist +``` + +``` +==> data/local/dict/lexicon.txt <== +啊 啊 +啊啊啊 啊 啊 啊 +阿 阿 +阿尔 阿 尔 +阿根廷 阿 根 廷 +阿九 阿 九 +阿克 阿 克 +阿拉伯数字 阿 拉 伯 数 字 +阿拉法特 阿 拉 法 特 +阿拉木图 阿 拉 木 图 + +==> data/local/dict/units.txt <== + + +A +B +C +D +E +F +G +H + +==> data/local/lm/heldout <== +而 对 楼市 成交 抑制 作用 最 大 的 限 购 +也 成为 地方 政府 的 眼中 钉 +自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +各地 政府 便 纷纷 跟进 +仅 一 个 多 月 的 时间 里 +除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +四十六 个 限 购 城市 当中 +四十一 个 已 正式 取消 或 变相 放松 了 限 购 +财政 金融 政策 紧随 其后 而来 +显示 出 了 极 强 的 威力 + +==> data/local/lm/lm.arpa <== + +\data\ +ngram 1=129356 +ngram 2=504661 +ngram 3=123455 + +\1-grams: +-1.531278 +-3.828829 -0.1600094 +-6.157292 + +==> data/local/lm/text <== +BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购 +BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉 +BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 +BAC009S0002W0125 各地 政府 便 纷纷 跟进 +BAC009S0002W0126 仅 一 个 多 月 的 时间 里 +BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 +BAC009S0002W0128 四十六 个 限 购 城市 当中 +BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购 
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+
+==> data/local/lm/text.no_oov <==
+<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉
+<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进
+<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里
+<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+<SPOKEN_NOISE> 四十六 个 限 购 城市 当中
+<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来
+<SPOKEN_NOISE> 显示 出 了 极 强 的 威力
+
+==> data/local/lm/train <==
+汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
+并 计划 朝云 计算 方面 发展
+汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
+媒体 就 曾 披露 这笔 交易
+虽然 双方 已经 正式 签署 了 外包 协议
+但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
+陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
+并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
+曾 在 多家 国际 公司 任职
+拥有 业务 开发 商务 及 企业 治理
+
+==> data/local/lm/unigram.counts <==
+  57487 的
+  13099 在
+  11862 一
+  11397 了
+  10998 不
+   9913 是
+   7952 有
+   6250 和
+   6152 个
+   5422 将
+
+==> data/local/lm/word.counts <==
+  57486 的
+  13098 在
+  11861 一
+  11396 了
+  10997 不
+   9912 是
+   7951 有
+   6249 和
+   6151 个
+   5421 将
+
+==> data/local/lm/wordlist <==
+的
+在
+一
+了
+不
+是
+有
+和
+个
+将
+```
+
+## Output
+
 ```
 fstaddselfloops 'echo 4234 |' 'echo 123660 |'
 Lexicon and Token FSTs compiling succeeded
@@ -16,3 +159,28 @@ fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
 Composing decoding graph TLG.fst succeeded
 Aishell build TLG done.
 ```
+
+```
+data/
+├── lang_test
+│   ├── G.fst
+│   ├── L.fst
+│   ├── LG.fst
+│   ├── T.fst
+│   ├── TLG.fst
+│   ├── tokens.txt
+│   ├── units.txt
+│   └── words.txt
+└── local
+    ├── lang
+    │   ├── L.fst
+    │   ├── T.fst
+    │   ├── tokens.txt
+    │   ├── units.txt
+    │   └── words.txt
+    └── tmp
+        ├── disambig.list
+        ├── lexiconp_disambig.txt
+        ├── lexiconp.txt
+        └── units.list
+```
\ No newline at end of file
diff --git a/speechx/examples/wfst/path.sh b/speechx/examples/wfst/path.sh
index 877f2399..a07c1297 100644
--- a/speechx/examples/wfst/path.sh
+++ b/speechx/examples/wfst/path.sh
@@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.
-MAIN_ROOT=`realpath $PWD/../../../../`
+MAIN_ROOT=`realpath $PWD/../../../`
 SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
 export LC_AL=C
diff --git a/speechx/examples/wfst/run.sh b/speechx/examples/wfst/run.sh
old mode 100644
new mode 100755
index b53e1a5b..1354646a
--- a/speechx/examples/wfst/run.sh
+++ b/speechx/examples/wfst/run.sh
@@ -5,54 +5,25 @@ set -eo pipefail
 stage=-1
 stop_stage=100
-corpus=aishell
-lmtype=srilm
-lexicon=  # aishell/resource_aishell/lexicon.txt
-text=  # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+. utils/parse_options.sh
-source parse_options.sh
-
-if [ ! which fstprint ]; then
+if !
which fstprint ; then pushd $MAIN_ROOT/tools make kaldi.done popd fi -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # 7.1 Prepare dict - unit_file=data/vocab.txt - mkdir -p data/local/dict - cp $unit_file data/local/dict/units.txt - utils/fst/prepare_dict.py \ - --unit_file $unit_file \ - --in_lexicon ${lexicon} \ - --out_lexicon data/local/dict/lexicon.txt -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # 7.2 Train lm - lm=data/local/lm - mkdir -p data/train - mkdir -p $lm - utils/manifest_key_value.py \ - --manifest_path data/manifest.train \ - --output_path data/train - utils/filter_scp.pl data/train/text \ - $text > $lm/text - if [ $lmtype == 'srilm' ];then - local/aishell_train_lms.sh - else - utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa - fi -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # 7.3 Build decoding TLG +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # build T & L + # utils/fst/compile_lexicon_token_fst.sh utils/fst/compile_lexicon_token_fst.sh \ data/local/dict data/local/tmp data/local/lang + + # build G & LG & TLG + # utils/fst/make_tlg.sh utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; fi -echo "Aishell build TLG done." +echo "build TLG done." exit 0 diff --git a/speechx/examples/wfst/utils b/speechx/examples/wfst/utils new file mode 120000 index 00000000..256f914a --- /dev/null +++ b/speechx/examples/wfst/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file From deb3ba070b3b333489f903cfa4c31ee95b45e480 Mon Sep 17 00:00:00 2001 From: qingen Date: Mon, 18 Apr 2022 11:47:34 +0800 Subject: [PATCH 12/18] [vec] update mata info, test=doc --- paddlespeech/vector/io/dataset_from_json.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/vector/io/dataset_from_json.py b/paddlespeech/vector/io/dataset_from_json.py index 5ffd2c18..c9294971 100644 --- a/paddlespeech/vector/io/dataset_from_json.py +++ b/paddlespeech/vector/io/dataset_from_json.py @@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc class meta_info: """the audio meta info in the vector JSONDataset Args: - id (str): the segment name + utt_id (str): the segment name duration (float): segment time wav (str): wav file path start (int): start point in the original wav file stop (int): stop point in the original wav file lab_id (str): the record id """ - id: str + utt_id: str duration: float wav: str start: int From 9c0ceaacb6aafa1175b0df7372fb411e2fd772fe Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:27:45 +0800 Subject: [PATCH 13/18] add streaming am infer, test=doc --- .../server/engine/tts/online/tts_engine.py | 517 ++++++++++++++++-- paddlespeech/server/utils/util.py | 4 + 2 files changed, 462 insertions(+), 59 deletions(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index 25a8bc76..8e76225d 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -12,24 +12,322 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import base64 +import math +import os import time +from typing import Optional import numpy as np import paddle +import yaml +from yacs.config import CfgNode from paddlespeech.cli.log import logger from paddlespeech.cli.tts.infer import TTSExecutor +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.server.engine.base_engine import BaseEngine from paddlespeech.server.utils.audio_process import float2pcm +from paddlespeech.server.utils.util import denorm from paddlespeech.server.utils.util import get_chunks +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +__all__ = ['TTSEngine'] + +# support online model +pretrained_models = { + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_cnndecoder_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip', + 'md5': + '6eb28e22ace73e0ebe7845f86478f89f', + 'config': + 'cnndecoder.yaml', + 'ckpt': + 'snapshot_iter_153000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, +} + +model_alias = { + # acoustic model + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + + # voc + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", +} __all__ = ['TTSEngine'] class TTSServerExecutor(TTSExecutor): - def __init__(self): + def __init__(self, am_block, am_pad, voc_block, voc_pad): super().__init__() - pass + self.am_block = am_block + self.am_pad = am_pad + self.voc_block = voc_block + self.voc_pad = voc_pad + + def get_model_info(self, step, model_name, ckpt, stat): + """get model information + + Args: + step (string): am or voc + model_name (string): model type, support fastspeech2, higigan, mb_melgan + ckpt (string): ckpt file + stat (string): stat file, including mean and standard deviation + + Returns: + model, model_mu, model_std + """ + model_class = dynamic_import(model_name, model_alias) + + if step == "am": + odim = self.am_config.n_mels + model = model_class( + idim=self.vocab_size, odim=odim, 
**self.am_config["model"]) + model.set_state_dict(paddle.load(ckpt)["main_params"]) + + elif step == "voc": + model = model_class(**self.voc_config["generator_params"]) + model.set_state_dict(paddle.load(ckpt)["generator_params"]) + model.remove_weight_norm() + + else: + logger.error("Please set correct step, am or voc") + + model.eval() + model_mu, model_std = np.load(stat) + model_mu = paddle.to_tensor(model_mu) + model_std = paddle.to_tensor(model_std) + + return model, model_mu, model_std + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + return decompressed_path + + def _init_from_path( + self, + am: str='fastspeech2_csmsc', + am_config: Optional[os.PathLike]=None, + am_ckpt: Optional[os.PathLike]=None, + am_stat: Optional[os.PathLike]=None, + phones_dict: Optional[os.PathLike]=None, + tones_dict: Optional[os.PathLike]=None, + speaker_dict: Optional[os.PathLike]=None, + voc: str='mb_melgan_csmsc', + voc_config: Optional[os.PathLike]=None, + voc_ckpt: Optional[os.PathLike]=None, + voc_stat: Optional[os.PathLike]=None, + lang: str='zh', ): + """ + Init model and other resources from a specific path. + """ + if hasattr(self, 'am_inference') and hasattr(self, 'voc_inference'): + logger.info('Models had been initialized.') + return + # am model info + am_tag = am + '-' + lang + if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: + am_res_path = self._get_pretrained_path(am_tag) + self.am_res_path = am_res_path + self.am_config = os.path.join(am_res_path, + pretrained_models[am_tag]['config']) + self.am_ckpt = os.path.join(am_res_path, + pretrained_models[am_tag]['ckpt']) + self.am_stat = os.path.join( + am_res_path, pretrained_models[am_tag]['speech_stats']) + # must have phones_dict in acoustic + self.phones_dict = os.path.join( + am_res_path, pretrained_models[am_tag]['phones_dict']) + print("self.phones_dict:", self.phones_dict) + logger.info(am_res_path) + logger.info(self.am_config) + logger.info(self.am_ckpt) + else: + self.am_config = os.path.abspath(am_config) + self.am_ckpt = os.path.abspath(am_ckpt) + self.am_stat = os.path.abspath(am_stat) + self.phones_dict = os.path.abspath(phones_dict) + self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) + print("self.phones_dict:", self.phones_dict) + + self.tones_dict = None + self.speaker_dict = None + + # voc model info + voc_tag = voc + '-' + lang + if voc_ckpt is None or voc_config is None or voc_stat is None: + voc_res_path = self._get_pretrained_path(voc_tag) + self.voc_res_path = voc_res_path + self.voc_config = os.path.join(voc_res_path, + pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join(voc_res_path, + pretrained_models[voc_tag]['ckpt']) + self.voc_stat = os.path.join( + voc_res_path, pretrained_models[voc_tag]['speech_stats']) + logger.info(voc_res_path) + logger.info(self.voc_config) + logger.info(self.voc_ckpt) + else: + self.voc_config = 
os.path.abspath(voc_config) + self.voc_ckpt = os.path.abspath(voc_ckpt) + self.voc_stat = os.path.abspath(voc_stat) + self.voc_res_path = os.path.dirname( + os.path.abspath(self.voc_config)) + + # Init body. + with open(self.am_config) as f: + self.am_config = CfgNode(yaml.safe_load(f)) + with open(self.voc_config) as f: + self.voc_config = CfgNode(yaml.safe_load(f)) + + with open(self.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + self.vocab_size = len(phn_id) + print("vocab_size:", self.vocab_size) + + # frontend + if lang == 'zh': + self.frontend = Frontend( + phone_vocab_path=self.phones_dict, + tone_vocab_path=self.tones_dict) + + elif lang == 'en': + self.frontend = English(phone_vocab_path=self.phones_dict) + print("frontend done!") + + # am infer info + self.am_name = am[:am.rindex('_')] + if self.am_name == "fastspeech2_cnndecoder": + self.am_inference, self.am_mu, self.am_std = self.get_model_info( + "am", "fastspeech2", self.am_ckpt, self.am_stat) + else: + am, am_mu, am_std = self.get_model_info("am", self.am_name, + self.am_ckpt, self.am_stat) + am_normalizer = ZScore(am_mu, am_std) + am_inference_class = dynamic_import(self.am_name + '_inference', + model_alias) + self.am_inference = am_inference_class(am_normalizer, am) + self.am_inference.eval() + print("acoustic model done!") + + # voc infer info + self.voc_name = voc[:voc.rindex('_')] + voc, voc_mu, voc_std = self.get_model_info("voc", self.voc_name, + self.voc_ckpt, self.voc_stat) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference_class = dynamic_import(self.voc_name + '_inference', + model_alias) + self.voc_inference = voc_inference_class(voc_normalizer, voc) + self.voc_inference.eval() + print("voc done!") + + def get_phone(self, sentence, lang, merge_sentences, get_tone_ids): + tone_ids = None + if lang == 'zh': + input_ids = self.frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + elif lang == 'en': + input_ids = self.frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + def depadding(self, data, chunk_num, chunk_id, block, pad, upsample): + """ + Streaming inference removes the result of pad inference + """ + front_pad = min(chunk_id * block, pad) + # first chunk + if chunk_id == 0: + data = data[:block * upsample] + # last chunk + elif chunk_id == chunk_num - 1: + data = data[front_pad * upsample:] + # middle chunk + else: + data = data[front_pad * upsample:(front_pad + block) * upsample] + + return data @paddle.no_grad() def infer( @@ -37,16 +335,19 @@ class TTSServerExecutor(TTSExecutor): text: str, lang: str='zh', am: str='fastspeech2_csmsc', - spk_id: int=0, - am_block: int=42, - am_pad: int=12, - voc_block: int=14, - voc_pad: int=14, ): + spk_id: int=0, ): """ Model inference and result stored in self.output. 
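+
+        A worked example of the streaming scheme (illustrative numbers:
+        voc_block=14 and voc_pad=14 as in the server defaults, and assuming
+        a vocoder hop size n_shift of 300 samples): each mel chunk carries
+        up to voc_pad extra frames of context on each side, and depadding
+        drops that context from the synthesized audio again, so chunk 0
+        keeps samples [0, 14*300) of its output while every middle chunk
+        keeps [14*300, 28*300), i.e. 4200 new samples each; concatenated,
+        the chunks reproduce the non-streaming waveform.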
""" - am_name = am[:am.rindex('_')] - am_dataset = am[am.rindex('_') + 1:] + + am_block = self.am_block + am_pad = self.am_pad + am_upsample = 1 + voc_block = self.voc_block + voc_pad = self.voc_pad + voc_upsample = self.voc_config.n_shift + flag = 1 + get_tone_ids = False merge_sentences = False frontend_st = time.time() @@ -64,43 +365,99 @@ class TTSServerExecutor(TTSExecutor): phone_ids = input_ids["phone_ids"] else: print("lang should in {'zh', 'en'}!") - self.frontend_time = time.time() - frontend_st + frontend_et = time.time() + self.frontend_time = frontend_et - frontend_st for i in range(len(phone_ids)): - am_st = time.time() part_phone_ids = phone_ids[i] - # am - if am_name == 'speedyspeech': - part_tone_ids = tone_ids[i] - mel = self.am_inference(part_phone_ids, part_tone_ids) - # fastspeech2 + voc_chunk_id = 0 + + # fastspeech2_csmsc + if am == "fastspeech2_csmsc": + # am + mel = self.am_inference(part_phone_ids) + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + + # voc streaming + mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") + voc_chunk_num = len(mel_chunks) + voc_st = time.time() + for i, mel_chunk in enumerate(mel_chunks): + sub_wav = self.voc_inference(mel_chunk) + sub_wav = self.depadding(sub_wav, voc_chunk_num, i, + voc_block, voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + # fastspeech2_cnndecoder_csmsc + elif am == "fastspeech2_cnndecoder_csmsc": + # am + orig_hs, h_masks = self.am_inference.encoder_infer( + part_phone_ids) + + # streaming voc chunk info + mel_len = orig_hs.shape[1] + voc_chunk_num = math.ceil(mel_len / self.voc_block) + start = 0 + end = min(self.voc_block + self.voc_pad, mel_len) + + # streaming am + hss = get_chunks(orig_hs, self.am_block, self.am_pad, "am") + am_chunk_num = len(hss) + for i, hs in enumerate(hss): + before_outs, _ = self.am_inference.decoder(hs) + after_outs = before_outs + self.am_inference.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, self.am_mu, self.am_std) + sub_mel = self.depadding(sub_mel, am_chunk_num, i, am_block, + am_pad, am_upsample) + + if i == 0: + mel_streaming = sub_mel + else: + mel_streaming = np.concatenate( + (mel_streaming, sub_mel), axis=0) + + # streaming voc + while (mel_streaming.shape[0] >= end and + voc_chunk_id < voc_chunk_num): + if flag == 1: + first_am_et = time.time() + self.first_am_infer = first_am_et - frontend_et + voc_chunk = mel_streaming[start:end, :] + voc_chunk = paddle.to_tensor(voc_chunk) + sub_wav = self.voc_inference(voc_chunk) + + sub_wav = self.depadding(sub_wav, voc_chunk_num, + voc_chunk_id, voc_block, + voc_pad, voc_upsample) + if flag == 1: + first_voc_et = time.time() + self.first_voc_infer = first_voc_et - first_am_et + self.first_response_time = first_voc_et - frontend_st + flag = 0 + + yield sub_wav + + voc_chunk_id += 1 + start = max(0, voc_chunk_id * voc_block - voc_pad) + end = min((voc_chunk_id + 1) * voc_block + voc_pad, + mel_len) + else: - # multi speaker - if am_dataset in {"aishell3", "vctk"}: - mel = self.am_inference( - part_phone_ids, spk_id=paddle.to_tensor(spk_id)) - else: - mel = self.am_inference(part_phone_ids) - am_et = time.time() - - # voc streaming - voc_upsample = self.voc_config.n_shift - mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc") - chunk_num = 
len(mel_chunks) - voc_st = time.time() - for i, mel_chunk in enumerate(mel_chunks): - sub_wav = self.voc_inference(mel_chunk) - front_pad = min(i * voc_block, voc_pad) - - if i == 0: - sub_wav = sub_wav[:voc_block * voc_upsample] - elif i == chunk_num - 1: - sub_wav = sub_wav[front_pad * voc_upsample:] - else: - sub_wav = sub_wav[front_pad * voc_upsample:( - front_pad + voc_block) * voc_upsample] - - yield sub_wav + logger.error( + "Only support fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc on streaming tts." + ) + + self.final_response_time = time.time() - frontend_st class TTSEngine(BaseEngine): @@ -116,11 +473,18 @@ class TTSEngine(BaseEngine): super(TTSEngine, self).__init__() def init(self, config: dict) -> bool: - self.executor = TTSServerExecutor() self.config = config - assert "fastspeech2_csmsc" in config.am and ( - config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc" + assert ( + config.am == "fastspeech2_csmsc" or + config.am == "fastspeech2_cnndecoder_csmsc" + ) and ( + config.voc == "hifigan_csmsc" or config.voc == "mb_melgan_csmsc" ), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.' + + assert ( + config.voc_block > 0 and config.voc_pad > 0 + ), "Please set correct voc_block and voc_pad, they should be more than 0." + try: if self.config.device: self.device = self.config.device @@ -135,6 +499,9 @@ class TTSEngine(BaseEngine): (self.device)) return False + self.executor = TTSServerExecutor(config.am_block, config.am_pad, + config.voc_block, config.voc_pad) + try: self.executor._init_from_path( am=self.config.am, @@ -155,15 +522,42 @@ class TTSEngine(BaseEngine): (self.device)) return False - self.am_block = self.config.am_block - self.am_pad = self.config.am_pad - self.voc_block = self.config.voc_block - self.voc_pad = self.config.voc_pad - logger.info("Initialize TTS server engine successfully on device: %s." % (self.device)) + + # warm up + try: + self.warm_up() + except Exception as e: + logger.error("Failed to warm up on tts engine.") + return False + return True + def warm_up(self): + """warm up + """ + if self.config.lang == 'zh': + sentence = "您好,欢迎使用语音合成服务。" + if self.config.lang == 'en': + sentence = "Hello and welcome to the speech synthesis service." + logger.info( + "*******************************warm up ********************************" + ) + for i in range(3): + for wav in self.executor.infer( + text=sentence, + lang=self.config.lang, + am=self.config.am, + spk_id=0, ): + logger.info( + f"The first response time of the {i} warm up: {self.executor.first_response_time} s" + ) + break + logger.info( + "**********************************************************************" + ) + def preprocess(self, text_bese64: str=None, text_bytes: bytes=None): # Convert byte to text if text_bese64: @@ -195,18 +589,14 @@ class TTSEngine(BaseEngine): wav_base64: The base64 format of the synthesized audio. 
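+
+        Example (a client-side sketch; it assumes the receiver binds one
+        yielded chunk to `wav_base64`)::
+
+            import base64
+            import numpy as np
+
+            pcm_bytes = base64.b64decode(wav_base64)
+            pcm = np.frombuffer(pcm_bytes, dtype=np.int16)  # 16-bit PCM chunk
+            wav = pcm.astype(np.float32) / 32768.0          # back to [-1.0, 1.0)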
""" - lang = self.config.lang wav_list = [] for wav in self.executor.infer( text=sentence, - lang=lang, + lang=self.config.lang, am=self.config.am, - spk_id=spk_id, - am_block=self.am_block, - am_pad=self.am_pad, - voc_block=self.voc_block, - voc_pad=self.voc_pad): + spk_id=spk_id, ): + # wav type: float32, convert to pcm (base64) wav = float2pcm(wav) # float32 to int16 wav_bytes = wav.tobytes() # to bytes @@ -216,5 +606,14 @@ class TTSEngine(BaseEngine): yield wav_base64 wav_all = np.concatenate(wav_list, axis=0) - logger.info("The durations of audio is: {} s".format( - len(wav_all) / self.executor.am_config.fs)) + duration = len(wav_all) / self.executor.am_config.fs + logger.info(f"sentence: {sentence}") + logger.info(f"The durations of audio is: {duration} s") + logger.info( + f"first response time: {self.executor.first_response_time} s") + logger.info( + f"final response time: {self.executor.final_response_time} s") + logger.info(f"RTF: {self.executor.final_response_time / duration}") + logger.info( + f"Other info: front time: {self.executor.frontend_time} s, first am infer time: {self.executor.first_am_infer} s, first voc infer time: {self.executor.first_voc_infer} s," + ) diff --git a/paddlespeech/server/utils/util.py b/paddlespeech/server/utils/util.py index 0fe70849..72ee0060 100644 --- a/paddlespeech/server/utils/util.py +++ b/paddlespeech/server/utils/util.py @@ -52,6 +52,10 @@ def get_chunks(data, block_size, pad_size, step): Returns: list: chunks list """ + + if block_size == -1: + return [data] + if step == "am": data_len = data.shape[1] elif step == "voc": From 00a6236fe2c0affa3093551c1d88f0a92b2d0a42 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Mon, 18 Apr 2022 17:31:47 +0800 Subject: [PATCH 14/18] remove test code, test=doc --- paddlespeech/server/tests/tts/infer/run.sh | 62 -- .../server/tests/tts/infer/test_online_tts.py | 610 ------------------ 2 files changed, 672 deletions(-) delete mode 100644 paddlespeech/server/tests/tts/infer/run.sh delete mode 100644 paddlespeech/server/tests/tts/infer/test_online_tts.py diff --git a/paddlespeech/server/tests/tts/infer/run.sh b/paddlespeech/server/tests/tts/infer/run.sh deleted file mode 100644 index 3733c3fb..00000000 --- a/paddlespeech/server/tests/tts/infer/run.sh +++ /dev/null @@ -1,62 +0,0 @@ -model_path=~/.paddlespeech/models/ -am_model_dir=$model_path/fastspeech2_csmsc-zh/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0/ -voc_model_dir=$model_path/mb_melgan_csmsc-zh/mb_melgan_csmsc_ckpt_0.1.1/ -testdata=../../../../t2s/exps/csmsc_test.txt - -# get am file -for file in $(ls $am_model_dir) -do - if [[ $file == *"yaml"* ]]; then - am_config_file=$file - elif [[ $file == *"pdz"* ]]; then - am_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - am_stat_file=$file - elif [[ $file == *"phone"* ]]; then - phones_dict_file=$file - fi - -done - -# get voc file -for file in $(ls $voc_model_dir) -do - if [[ $file == *"yaml"* ]]; then - voc_config_file=$file - elif [[ $file == *"pdz"* ]]; then - voc_ckpt_file=$file - elif [[ $file == *"stat"* ]]; then - voc_stat_file=$file - fi - -done - - -# run test -# am can choose fastspeech2_csmsc or fastspeech2_cnndecoder_csmsc, where fastspeech2_cnndecoder_csmsc supports streaming inference. -# voc can choose hifigan_csmsc and mb_melgan_csmsc, They can both support streaming inference. -# When am is fastspeech2_cnndecoder_csmsc and am_pad is set to 12, there is no diff between streaming and non-streaming inference results. 
-# When voc is mb_melgan_csmsc and voc_pad is set to 14, there is no diff between streaming and non-streaming inference results. -# When voc is hifigan_csmsc and voc_pad is set to 20, there is no diff between streaming and non-streaming inference results. - -python test_online_tts.py --am fastspeech2_cnndecoder_csmsc \ - --am_config $am_model_dir/$am_config_file \ - --am_ckpt $am_model_dir/$am_ckpt_file \ - --am_stat $am_model_dir/$am_stat_file \ - --phones_dict $am_model_dir/$phones_dict_file \ - --voc mb_melgan_csmsc \ - --voc_config $voc_model_dir/$voc_config_file \ - --voc_ckpt $voc_model_dir/$voc_ckpt_file \ - --voc_stat $voc_model_dir/$voc_stat_file \ - --lang zh \ - --device cpu \ - --text $testdata \ - --output_dir ./output \ - --log_file ./result.log \ - --am_streaming True \ - --am_pad 12 \ - --am_block 42 \ - --voc_streaming True \ - --voc_pad 14 \ - --voc_block 14 \ - diff --git a/paddlespeech/server/tests/tts/infer/test_online_tts.py b/paddlespeech/server/tests/tts/infer/test_online_tts.py deleted file mode 100644 index eb5fc80b..00000000 --- a/paddlespeech/server/tests/tts/infer/test_online_tts.py +++ /dev/null @@ -1,610 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import argparse -import logging -import math -import threading -import time -from pathlib import Path - -import numpy as np -import paddle -import soundfile as sf -import yaml -from yacs.config import CfgNode - -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.exps.syn_utils import get_am_inference -from paddlespeech.t2s.exps.syn_utils import get_frontend -from paddlespeech.t2s.exps.syn_utils import get_sentences -from paddlespeech.t2s.exps.syn_utils import get_voc_inference -from paddlespeech.t2s.exps.syn_utils import model_alias -from paddlespeech.t2s.utils import str2bool - -mel_streaming = None -wav_streaming = None -streaming_first_time = 0.0 -streaming_voc_st = 0.0 -sample_rate = 0 - - -def denorm(data, mean, std): - return data * std + mean - - -def get_chunks(data, block_size, pad_size, step): - if step == "am": - data_len = data.shape[1] - elif step == "voc": - data_len = data.shape[0] - else: - print("Please set correct type to get chunks, am or voc") - - chunks = [] - n = math.ceil(data_len / block_size) - for i in range(n): - start = max(0, i * block_size - pad_size) - end = min((i + 1) * block_size + pad_size, data_len) - if step == "am": - chunks.append(data[:, start:end, :]) - elif step == "voc": - chunks.append(data[start:end, :]) - else: - print("Please set correct type to get chunks, am or voc") - return chunks - - -def get_streaming_am_inference(args, am_config): - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - am_name = "fastspeech2" - odim = am_config.n_mels - - am_class = dynamic_import(am_name, model_alias) - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - - return am, am_mu, am_std - - -def init(args): - global sample_rate - # get config - with open(args.am_config) as f: - am_config = CfgNode(yaml.safe_load(f)) - with open(args.voc_config) as f: - voc_config = CfgNode(yaml.safe_load(f)) - - sample_rate = am_config.fs - - # frontend - frontend = get_frontend(args) - - # acoustic model - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std = get_streaming_am_inference(args, am_config) - am_infer_info = [am, am_mu, am_std, am_config] - else: - am_inference, am_name, am_dataset = get_am_inference(args, am_config) - am_infer_info = [am_inference, am_name, am_dataset, am_config] - - # vocoder - voc_inference = get_voc_inference(args, voc_config) - voc_infer_info = [voc_inference, voc_config] - - return frontend, am_infer_info, voc_infer_info - - -def get_phone(args, frontend, sentence, merge_sentences, get_tone_ids): - am_name = args.am[:args.am.rindex('_')] - tone_ids = None - - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, - merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - return phone_ids, tone_ids - - -@paddle.no_grad() -# 生成完整的mel -def gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids): - # 如果是支持流式的AM模型 - if args.am == 'fastspeech2_cnndecoder_csmsc': - am, am_mu, am_std, 
am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - if args.am_streaming: - am_pad = args.am_pad - am_block = args.am_block - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - mel_list = [] - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - - sub_mel.shape[0]] - mel_list.append(sub_mel) - mel = paddle.concat(mel_list, axis=0) - - else: - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - before_outs, _ = am.decoder(orig_hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - mel = denorm(normalized_mel, am_mu, am_std) - - else: - am_inference, am_name, am_dataset, am_config = am_infer_info - mel = am_inference(part_phone_ids) - - return mel - - -@paddle.no_grad() -def streaming_voc_infer(args, voc_infer_info, mel_len): - global mel_streaming - global streaming_first_time - global wav_streaming - voc_inference, voc_config = voc_infer_info - block = args.voc_block - pad = args.voc_pad - upsample = voc_config.n_shift - wav_list = [] - flag = 1 - - valid_start = 0 - valid_end = min(valid_start + block, mel_len) - actual_start = 0 - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - while valid_end <= mel_len: - sub_wav = voc_inference(mel_chunk) - if flag == 1: - streaming_first_time = time.time() - flag = 0 - - # get valid wav - start = valid_start - actual_start - if valid_end == mel_len: - sub_wav = sub_wav[start * upsample:] - wav_list.append(sub_wav) - break - else: - end = start + block - sub_wav = sub_wav[start * upsample:end * upsample] - wav_list.append(sub_wav) - - # generate new mel chunk - valid_start = valid_end - valid_end = min(valid_start + block, mel_len) - if valid_start - pad < 0: - actual_start = 0 - else: - actual_start = valid_start - pad - actual_end = min(valid_end + pad, mel_len) - mel_chunk = mel_streaming[actual_start:actual_end, :] - - wav = paddle.concat(wav_list, axis=0) - wav_streaming = wav - - -@paddle.no_grad() -# 非流式AM / 流式AM + 非流式Voc -def am_nonstreaming_voc(args, am_infer_info, voc_infer_info, part_phone_ids, - part_tone_ids): - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - voc_inference, voc_config = voc_infer_info - wav = voc_inference(mel) - first_response_time = time.time() - final_response_time = first_response_time - voc_infer_time = first_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 非流式AM + 流式Voc -def nonstreaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - - mel = gen_mel(args, am_infer_info, part_phone_ids, part_tone_ids) - am_infer_time = time.time() - - # voc streaming - mel_streaming = mel - mel_len = mel.shape[0] - streaming_voc_infer(args, voc_infer_info, mel_len) - first_response_time = streaming_first_time - wav = wav_streaming - final_response_time = time.time() - voc_infer_time = 
final_response_time - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -@paddle.no_grad() -# 流式AM + 流式 Voc -def streaming_am_streaming_voc(args, am_infer_info, voc_infer_info, - part_phone_ids, part_tone_ids): - global mel_streaming - global streaming_first_time - global wav_streaming - global streaming_voc_st - mel_streaming = None - #用来表示开启流式voc的线程 - flag = 1 - - am, am_mu, am_std, am_config = am_infer_info - orig_hs, h_masks = am.encoder_infer(part_phone_ids) - mel_len = orig_hs.shape[1] - am_block = args.am_block - am_pad = args.am_pad - hss = get_chunks(orig_hs, am_block, am_pad, "am") - chunk_num = len(hss) - - for i, hs in enumerate(hss): - before_outs, _ = am.decoder(hs) - after_outs = before_outs + am.postnet( - before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) - normalized_mel = after_outs[0] - sub_mel = denorm(normalized_mel, am_mu, am_std) - # clip output part of pad - if i == 0: - sub_mel = sub_mel[:-am_pad] - mel_streaming = sub_mel - elif i == chunk_num - 1: - # 最后一块的右侧一定没有 pad 够 - sub_mel = sub_mel[am_pad:] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - am_infer_time = time.time() - else: - # 倒数几块的右侧也可能没有 pad 够 - sub_mel = sub_mel[am_pad:(am_block + am_pad) - sub_mel.shape[0]] - mel_streaming = paddle.concat([mel_streaming, sub_mel]) - - if flag and mel_streaming.shape[0] > args.voc_block + args.voc_pad: - t = threading.Thread( - target=streaming_voc_infer, - args=(args, voc_infer_info, mel_len, )) - t.start() - streaming_voc_st = time.time() - flag = 0 - - t.join() - final_response_time = time.time() - voc_infer_time = final_response_time - first_response_time = streaming_first_time - wav = wav_streaming - - return am_infer_time, voc_infer_time, first_response_time, final_response_time, wav - - -def warm_up(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - logger.info( - "Before the formal test, we test a few texts to make the inference speed more stable." - ) - if args.lang == 'zh': - sentence = "您好,欢迎使用语音合成服务。" - if args.lang == 'en': - sentence = "Hello and welcome to the speech synthesis service." 
- - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - merge_sentences = True - get_tone_ids = False - for i in range(5): # 推理5次 - st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - wav = wav.numpy() - duration = wav.size / sample_rate - logger.info( - f"sentence: {sentence}; duration: {duration} s; first response time: {first_response_time - st} s; final response time: {final_response_time - st} s" - ) - - -def evaluate(args, logger, frontend, am_infer_info, voc_infer_info): - global sample_rate - sentences = get_sentences(args) - - output_dir = Path(args.output_dir) - output_dir.mkdir(parents=True, exist_ok=True) - get_tone_ids = False - merge_sentences = True - - # choose infer function - if args.voc_streaming: - if args.am_streaming: - infer_func = streaming_am_streaming_voc - else: - infer_func = nonstreaming_am_streaming_voc - else: - infer_func = am_nonstreaming_voc - - final_up_duration = 0.0 - sentence_count = 0 - front_time_list = [] - am_time_list = [] - voc_time_list = [] - first_response_list = [] - final_response_list = [] - sentence_length_list = [] - duration_list = [] - - for utt_id, sentence in sentences: - # front - front_st = time.time() - phone_ids, tone_ids = get_phone(args, frontend, sentence, - merge_sentences, get_tone_ids) - part_phone_ids = phone_ids[0] - if tone_ids: - part_tone_ids = tone_ids[0] - else: - part_tone_ids = None - front_et = time.time() - front_time = front_et - front_st - - am_st = time.time() - am_infer_time, voc_infer_time, first_response_time, final_response_time, wav = infer_func( - args, am_infer_info, voc_infer_info, part_phone_ids, part_tone_ids) - am_time = am_infer_time - am_st - if args.voc_streaming and args.am_streaming: - voc_time = voc_infer_time - streaming_voc_st - else: - voc_time = voc_infer_time - am_infer_time - - first_response = first_response_time - front_st - final_response = final_response_time - front_st - - wav = wav.numpy() - duration = wav.size / sample_rate - sf.write( - str(output_dir / (utt_id + ".wav")), wav, samplerate=sample_rate) - print(f"{utt_id} done!") - - sentence_count += 1 - front_time_list.append(front_time) - am_time_list.append(am_time) - voc_time_list.append(voc_time) - first_response_list.append(first_response) - final_response_list.append(final_response) - sentence_length_list.append(len(sentence)) - duration_list.append(duration) - - logger.info( - f"uttid: {utt_id}; sentence: '{sentence}'; front time: {front_time} s; am time: {am_time} s; voc time: {voc_time} s; \ - first response time: {first_response} s; final response time: {final_response} s; audio duration: {duration} s;" - ) - - if final_response > duration: - final_up_duration += 1 - - all_time_sum = sum(final_response_list) - front_rate = sum(front_time_list) / all_time_sum - am_rate = sum(am_time_list) / all_time_sum - voc_rate = sum(voc_time_list) / all_time_sum - rtf = all_time_sum / sum(duration_list) - - logger.info( - f"The length of test text information, test num: {sentence_count}; text num: {sum(sentence_length_list)}; min: {min(sentence_length_list)}; max: 
{max(sentence_length_list)}; avg: {sum(sentence_length_list)/len(sentence_length_list)}" - ) - logger.info( - f"duration information, min: {min(duration_list)}; max: {max(duration_list)}; avg: {sum(duration_list) / len(duration_list)}; sum: {sum(duration_list)}" - ) - logger.info( - f"Front time information: min: {min(front_time_list)} s; max: {max(front_time_list)} s; avg: {sum(front_time_list)/len(front_time_list)} s; ratio: {front_rate * 100}%" - ) - logger.info( - f"AM time information: min: {min(am_time_list)} s; max: {max(am_time_list)} s; avg: {sum(am_time_list)/len(am_time_list)} s; ratio: {am_rate * 100}%" - ) - logger.info( - f"Vocoder time information: min: {min(voc_time_list)} s, max: {max(voc_time_list)} s; avg: {sum(voc_time_list)/len(voc_time_list)} s; ratio: {voc_rate * 100}%" - ) - logger.info( - f"first response time information: min: {min(first_response_list)} s; max: {max(first_response_list)} s; avg: {sum(first_response_list)/len(first_response_list)} s" - ) - logger.info( - f"final response time information: min: {min(final_response_list)} s; max: {max(final_response_list)} s; avg: {sum(final_response_list)/len(final_response_list)} s" - ) - logger.info(f"RTF is: {rtf}") - logger.info( - f"The number of final_response is greater than duration is {final_up_duration}, ratio: {final_up_duration / sentence_count}%" - ) - - -def parse_args(): - # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser( - description="Synthesize with acoustic model & vocoder") - # acoustic model - parser.add_argument( - '--am', - type=str, - default='fastspeech2_csmsc', - choices=['fastspeech2_csmsc', 'fastspeech2_cnndecoder_csmsc'], - help='Choose acoustic model type of tts task. where fastspeech2_cnndecoder_csmsc supports streaming inference' - ) - - parser.add_argument( - '--am_config', - type=str, - default=None, - help='Config of acoustic model. Use deault config when it is None.') - parser.add_argument( - '--am_ckpt', - type=str, - default=None, - help='Checkpoint file of acoustic model.') - parser.add_argument( - "--am_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training acoustic model." - ) - parser.add_argument( - "--phones_dict", type=str, default=None, help="phone vocabulary file.") - parser.add_argument( - "--tones_dict", type=str, default=None, help="tone vocabulary file.") - # vocoder - parser.add_argument( - '--voc', - type=str, - default='mb_melgan_csmsc', - choices=['mb_melgan_csmsc', 'hifigan_csmsc'], - help='Choose vocoder type of tts task.') - parser.add_argument( - '--voc_config', - type=str, - default=None, - help='Config of voc. Use deault config when it is None.') - parser.add_argument( - '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') - parser.add_argument( - "--voc_stat", - type=str, - default=None, - help="mean and standard deviation used to normalize spectrogram when training voc." - ) - # other - parser.add_argument( - '--lang', - type=str, - default='zh', - choices=['zh', 'en'], - help='Choose model language. 
zh or en') - - parser.add_argument( - "--device", type=str, default='cpu', help="set cpu or gpu:id") - - parser.add_argument( - "--text", - type=str, - default="./csmsc_test.txt", - help="text to synthesize, a 'utt_id sentence' pair per line.") - parser.add_argument("--output_dir", type=str, help="output dir.") - parser.add_argument( - "--log_file", type=str, default="result.log", help="log file.") - - parser.add_argument( - "--am_streaming", - type=str2bool, - default=False, - help="whether use streaming acoustic model") - - parser.add_argument("--am_pad", type=int, default=12, help="am pad size.") - - parser.add_argument( - "--am_block", type=int, default=42, help="am block size.") - - parser.add_argument( - "--voc_streaming", - type=str2bool, - default=False, - help="whether use streaming vocoder model") - - parser.add_argument("--voc_pad", type=int, default=14, help="voc pad size.") - - parser.add_argument( - "--voc_block", type=int, default=14, help="voc block size.") - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - paddle.set_device(args.device) - if args.am_streaming: - assert (args.am == 'fastspeech2_cnndecoder_csmsc') - - logger = logging.getLogger() - fhandler = logging.FileHandler(filename=args.log_file, mode='w') - formatter = logging.Formatter( - '%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s' - ) - fhandler.setFormatter(formatter) - logger.addHandler(fhandler) - logger.setLevel(logging.DEBUG) - - # set basic information - logger.info( - f"AM: {args.am}; Vocoder: {args.voc}; device: {args.device}; am streaming: {args.am_streaming}; voc streaming: {args.voc_streaming}" - ) - logger.info( - f"am pad size: {args.am_pad}; am block size: {args.am_block}; voc pad size: {args.voc_pad}; voc block size: {args.voc_block};" - ) - - # get information about model - frontend, am_infer_info, voc_infer_info = init(args) - logger.info( - "************************ warm up *********************************") - warm_up(args, logger, frontend, am_infer_info, voc_infer_info) - logger.info( - "************************ normal test *******************************") - evaluate(args, logger, frontend, am_infer_info, voc_infer_info) - - -if __name__ == "__main__": - main() From 0ede6c2ee747100552a29cbcd9ef8ca72427527c Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 18 Apr 2022 12:47:09 +0000 Subject: [PATCH 15/18] train lm --- .../other/ngram_lm/s0/local/download_lm_zh.sh | 5 ++++ .../ngram/zh/local/aishell_train_lms.sh | 8 +++--- .../ngram/zh/local/text_to_lexicon.py | 12 ++++++--- speechx/examples/ngram/zh/run.sh | 22 +++++++++------- utils/fst/prepare_dict.py | 26 ++++++++++++++++--- 5 files changed, 52 insertions(+), 21 deletions(-) diff --git a/examples/other/ngram_lm/s0/local/download_lm_zh.sh b/examples/other/ngram_lm/s0/local/download_lm_zh.sh index f9e2261f..050749ce 100755 --- a/examples/other/ngram_lm/s0/local/download_lm_zh.sh +++ b/examples/other/ngram_lm/s0/local/download_lm_zh.sh @@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3" TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm +if [ -e $TARGET ];then + echo "already have lm" + exit 0; +fi + echo "Download language model ..." download $URL $MD5 $TARGET if [ $? 
-ne 0 ]; then
diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
index e3cee438..76266151 100755
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -29,12 +29,13 @@ mkdir -p $dir
 cleantext=$dir/text.no_oov
 
 # oov to <SPOKEN_NOISE>
-# line: utt word0 ... wordn -> line: word0 ... wordn
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   > $cleantext || exit 1;
 
-# compute word counts
+# compute word counts, sort in descending order
 # line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
    sort -nr > $dir/word.counts || exit 1;
@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
 cat $dir/unigram.counts  | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 
 # hold out to compute ppl
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-    # kaldi_lm results
+heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
-  # line: word ph0 ... phn -> line: word char0 ... charn
-  utils/fst/prepare_dict.py \
-      --unit_file $unit \
-      --in_lexicon ${lexicon} \
-      --out_lexicon data/local/dict/lexicon.txt
-  else
-    local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
+  if [ ! -f $lexicon ];then
+    local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+    echo "Generate $lexicon from $text"
   fi
+
+  # filter by vocab
+  # line: word ph0 ... phn -> line: word char0 ... charn
+  utils/fst/prepare_dict.py \
+      --unit_file $unit \
+      --in_lexicon ${lexicon} \
+      --out_lexicon data/local/dict/lexicon.txt
 fi
 
 lm=data/local/lm
diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
index f59cd311..301d72fb 100755
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -3,7 +3,8 @@ import argparse
 
 
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
     unit_table = set()
     with open(args.unit_file, 'r') as fin:
         for line in fin:
@@ -11,27 +12,41 @@
             unit_table.add(unit)
 
     def contain_oov(units):
+        """check whether a word's units are out of the vocabulary
+
+        Args:
+            units (list): the units (chars or BPE pieces) of one word
+
+        Returns:
+            bool: True if any unit is not in the vocab, else False.
+        """
         for unit in units:
             if unit not in unit_table:
                 return True
         return False
 
-    # load spm model
+    # load spm model, for English
     bpemode = args.bpemodel
     if bpemode:
         import sentencepiece as spm
         sp = spm.SentencePieceProcessor()
         sp.Load(sys.bpemodel)
 
-    # used to filter polyphone
+    # used to filter polyphones and invalid words
     lexicon_table = set()
+    in_n = 0  # input lexicon word count
+    out_n = 0  # output lexicon word count
     with open(args.in_lexicon, 'r') as fin, \
             open(args.out_lexicon, 'w') as fout:
         for line in fin:
             word = line.split()[0]
+            in_n += 1
+
             if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
+                # filter 'SIL' for mandarin, keep it in English
                 continue
             elif word == '<SPACE>':
+                # filter <SPACE>
                 continue
             else:
                 # each word only has one pronunciation for e2e system
                 if word in lexicon_table:
                     continue
 
             if bpemode:
+                # for english
                 pieces = sp.EncodeAsPieces(word)
                 if contain_oov(pieces):
                     print('Ignoring words {}, which contains oov unit'.
                          format(''.join(word).strip('▁')))
                     continue
+                # word is a piece list with no <unk> piece; words with OOV pieces were already filtered out by `contain_oov(pieces)`
                 chars = ' '.join(
                     [p if p in unit_table else '<unk>' for p in pieces])
             else:
@@ -58,11 +75,14 @@
                 # we assume the model unit of our e2e system is char now.
                 if word.encode('utf8').isalpha() and '▁' in unit_table:
                     word = '▁' + word
+                chars = ' '.join(word)  # word is a char list
 
             fout.write('{} {}\n'.format(word, chars))
             lexicon_table.add(word)
+            out_n += 1
+
+    print(f"Filter lexicon by unit table: filtered out {in_n - out_n}, kept {out_n}/{in_n}")
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(

From 40dde22fc48f41cffdace68847ccbeb00cc1cef4 Mon Sep 17 00:00:00 2001
From: lym0302
Date: Tue, 19 Apr 2022 12:59:48 +0800
Subject: [PATCH 16/18] code format, test=doc

---
 .../server/engine/tts/online/tts_engine.py    | 44 ++++++++++++-------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py
index 8e76225d..a84644e7 100644
--- a/paddlespeech/server/engine/tts/online/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/tts_engine.py
@@ -127,33 +127,40 @@ class TTSServerExecutor(TTSExecutor):
         self.voc_block = voc_block
         self.voc_pad = voc_pad
 
-    def get_model_info(self, step, model_name, ckpt, stat):
+    def get_model_info(self,
+                       field: str,
+                       model_name: str,
+                       ckpt: Optional[os.PathLike],
+                       stat: Optional[os.PathLike]):
         """get model information
 
         Args:
-            step (string): am or voc
-            model_name (string): model type, support fastspeech2, higigan, mb_melgan
-            ckpt (string): ckpt file
-            stat (string): stat file, including mean and standard deviation
+            field (str): am or voc
+            model_name (str): model type, supports fastspeech2, hifigan, mb_melgan
+            ckpt (Optional[os.PathLike]): ckpt file
+            stat (Optional[os.PathLike]): stat file, including mean and standard deviation
 
         Returns:
-            model, model_mu, model_std
+            [module]: model module
+            [Tensor]: mean
+            [Tensor]: standard deviation
         """
+
         model_class = dynamic_import(model_name, model_alias)
 
-        if step == "am":
+        if field == "am":
             odim = self.am_config.n_mels
             model = model_class(
                 idim=self.vocab_size, odim=odim, **self.am_config["model"])
             model.set_state_dict(paddle.load(ckpt)["main_params"])
 
-        elif step == "voc":
+        elif field == "voc":
             model = model_class(**self.voc_config["generator_params"])
             model.set_state_dict(paddle.load(ckpt)["generator_params"])
             model.remove_weight_norm()
 
         else:
-            logger.error("Please set correct step, am or voc")
+            logger.error("Please set correct field, am or voc")
 
         model.eval()
         model_mu, model_std = np.load(stat)
@@ -346,7 +353,8 @@ class TTSServerExecutor(TTSExecutor):
         voc_block = self.voc_block
         voc_pad = self.voc_pad
         voc_upsample = self.voc_config.n_shift
-        flag = 1
+        # first_flag marks the first response packet
+        first_flag = 1
 
         get_tone_ids = False
         merge_sentences = False
@@ -376,7 +384,7 @@ class TTSServerExecutor(TTSExecutor):
             if am == "fastspeech2_csmsc":
                 # am
                 mel = self.am_inference(part_phone_ids)
-                if flag == 1:
+                if first_flag == 1:
                     first_am_et = time.time()
                     self.first_am_infer = first_am_et - frontend_et
 
@@ -388,11 +396,11 @@
                     sub_wav = self.voc_inference(mel_chunk)
                     sub_wav = self.depadding(sub_wav, voc_chunk_num, i,
                                              voc_block, voc_pad, voc_upsample)
-                    if flag == 1:
+                    if first_flag == 1:
                         first_voc_et = time.time()
                         self.first_voc_infer = first_voc_et - first_am_et
                         self.first_response_time = first_voc_et - frontend_st
-                        flag = 0
+                        first_flag = 0
 
                     yield sub_wav
 
@@ -427,9 +435,10 @@ class TTSServerExecutor(TTSExecutor):
                         (mel_streaming, sub_mel), axis=0)
 
                 # streaming voc
+                # start streaming voc inference once the streaming AM has produced more mel frames than the voc chunk size
                 while (mel_streaming.shape[0] >= end and
                        voc_chunk_id < voc_chunk_num):
-                    if flag == 1:
+                    if first_flag == 1:
                         first_am_et = time.time()
                         self.first_am_infer = first_am_et - frontend_et
                     voc_chunk = mel_streaming[start:end, :]
@@ -439,11 +448,11 @@
                     sub_wav = self.depadding(sub_wav, voc_chunk_num,
                                              voc_chunk_id, voc_block, voc_pad,
                                              voc_upsample)
-                    if flag == 1:
+                    if first_flag == 1:
                         first_voc_et = time.time()
                         self.first_voc_infer = first_voc_et - first_am_et
                         self.first_response_time = first_voc_et - frontend_st
-                        flag = 0
+                        first_flag = 0
 
                     yield sub_wav
 
@@ -470,7 +479,8 @@ class TTSEngine(BaseEngine):
     def __init__(self, name=None):
         """Initialize TTS server engine
         """
-        super(TTSEngine, self).__init__()
+        #super(TTSEngine, self).__init__()
+        super().__init__()
 
     def init(self, config: dict) -> bool:
         self.config = config

From a44f5c099e3b112dda7b53e9491e897e4e8f38cf Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 19 Apr 2022 06:31:04 +0000
Subject: [PATCH 17/18] update cli, test=doc

---
 paddlespeech/cli/asr/infer.py                | 123 +------
 paddlespeech/cli/asr/pretrained_models.py    |  95 +++++
 paddlespeech/cli/cls/infer.py                |  77 +---
 paddlespeech/cli/cls/pretrained_models.py    |  47 +++
 paddlespeech/cli/executor.py                 |  34 +-
 paddlespeech/cli/st/infer.py                 |  55 +--
 paddlespeech/cli/st/pretrained_models.py     |  35 ++
 paddlespeech/cli/stats/infer.py              | 111 ++----
 paddlespeech/cli/text/infer.py               |  88 +----
 paddlespeech/cli/text/pretrained_models.py   |  54 +++
 paddlespeech/cli/tts/infer.py                | 349 ++-----------------
 paddlespeech/cli/tts/pretrained_models.py    | 300 ++++++++++++++++
 paddlespeech/cli/vector/infer.py             |  70 +---
 paddlespeech/cli/vector/pretrained_models.py |  36 ++
 tests/unit/cli/test_cli.sh                   |  26 +-
 15 files changed, 728 insertions(+), 772 deletions(-)
 create mode 100644 paddlespeech/cli/asr/pretrained_models.py
 create mode 100644 paddlespeech/cli/cls/pretrained_models.py
 create mode 100644 paddlespeech/cli/st/pretrained_models.py
 create mode 100644 paddlespeech/cli/text/pretrained_models.py
 create mode 100644 paddlespeech/cli/tts/pretrained_models.py
 create mode 100644 paddlespeech/cli/vector/pretrained_models.py

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index b12b9f6f..4b63e1e3 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -29,9 +29,10 @@ from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
-from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.transform.transformation import Transformation
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -39,94 +40,14 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 
 __all__ = ['ASRExecutor']
 
-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: - # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" - "conformer_wenetspeech-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', - 'md5': - '76cb19ed857e6623856b7cd7ebbfeda4', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/conformer/checkpoints/wenetspeech', - }, - "transformer_librispeech-en-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', - 'md5': - '2c667da24922aad391eacafe37bc1660', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/transformer/checkpoints/avg_10', - }, - "deepspeech2offline_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', - 'md5': - '932c3593d62fe5c741b59b31318aa314', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2/checkpoints/avg_1', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, - "deepspeech2online_aishell-zh-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', - 'md5': - '23e16c69730a1cb5d735c98c83c21e16', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2_online/checkpoints/avg_1', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', - 'lm_md5': - '29e02312deb2e59b3c8686c7966d4fe3' - }, - "deepspeech2offline_librispeech-en-16k": { - 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz', - 'md5': - 'f5666c81ad015c8de03aac2bc92e5762', - 'cfg_path': - 'model.yaml', - 'ckpt_path': - 'exp/deepspeech2/checkpoints/avg_1', - 'lm_url': - 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm', - 'lm_md5': - '099a601759d467cd0a8523ff939819c5' - }, -} - -model_alias = { - "deepspeech2offline": - "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "deepspeech2online": - "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", - "conformer": - "paddlespeech.s2t.models.u2:U2Model", - "transformer": - "paddlespeech.s2t.models.u2:U2Model", - "wenetspeech": - "paddlespeech.s2t.models.u2:U2Model", -} - @cli_register( name='paddlespeech.asr', description='Speech to text infer command.') class ASRExecutor(BaseExecutor): def __init__(self): - super(ASRExecutor, self).__init__() + super().__init__() + self.model_alias = model_alias + self.pretrained_models = pretrained_models self.parser = argparse.ArgumentParser( prog='paddlespeech.asr', add_help=True) @@ -136,7 +57,9 @@ class ASRExecutor(BaseExecutor): '--model', type=str, default='conformer_wenetspeech', - choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + choices=[ + tag[:tag.index('-')] for tag in self.pretrained_models.keys() + ], help='Choose model type of asr task.') self.parser.add_argument( '--lang', @@ -192,23 +115,6 @@ class ASRExecutor(BaseExecutor): action='store_true', help='Increase logger verbosity of current task.') - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path - def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', @@ -228,10 +134,11 @@ class ASRExecutor(BaseExecutor): tag = model_type + '-' + lang + '-' + sample_rate_str res_path = self._get_pretrained_path(tag) # wenetspeech_zh self.res_path = res_path - self.cfg_path = os.path.join(res_path, - pretrained_models[tag]['cfg_path']) + self.cfg_path = os.path.join( + res_path, self.pretrained_models[tag]['cfg_path']) self.ckpt_path = os.path.join( - res_path, pretrained_models[tag]['ckpt_path'] + ".pdparams") + res_path, + self.pretrained_models[tag]['ckpt_path'] + ".pdparams") logger.info(res_path) logger.info(self.cfg_path) logger.info(self.ckpt_path) @@ -255,8 +162,8 @@ class ASRExecutor(BaseExecutor): self.collate_fn_test = SpeechCollator.from_config(self.config) self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.vocab) - lm_url = pretrained_models[tag]['lm_url'] - lm_md5 = pretrained_models[tag]['lm_md5'] + lm_url = self.pretrained_models[tag]['lm_url'] + lm_md5 = self.pretrained_models[tag]['lm_md5'] self.download_lm( lm_url, os.path.dirname(self.config.decode.lang_model_path), lm_md5) @@ -274,7 +181,7 @@ class ASRExecutor(BaseExecutor): raise Exception("wrong type") model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) model_conf = self.config model = model_class.from_config(model_conf) self.model = model diff --git a/paddlespeech/cli/asr/pretrained_models.py b/paddlespeech/cli/asr/pretrained_models.py new file mode 100644 index 00000000..a16c4750 --- /dev/null +++ b/paddlespeech/cli/asr/pretrained_models.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". 
+ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "conformer_wenetspeech-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '76cb19ed857e6623856b7cd7ebbfeda4', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/wenetspeech', + }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '2c667da24922aad391eacafe37bc1660', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz', + 'md5': + '23e16c69730a1cb5d735c98c83c21e16', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2offline_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + 'f5666c81ad015c8de03aac2bc92e5762', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm', + 'lm_md5': + '099a601759d467cd0a8523ff939819c5' + }, +} + +model_alias = { + "deepspeech2offline": + "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": + "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "conformer": + "paddlespeech.s2t.models.u2:U2Model", + "transformer": + "paddlespeech.s2t.models.u2:U2Model", + "wenetspeech": + "paddlespeech.s2t.models.u2:U2Model", +} diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index f56d8a57..1f637a8f 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -25,55 +25,23 @@ import yaml from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register -from ..utils import download_and_decompress -from ..utils import MODEL_HOME from ..utils import stats_wrapper +from .pretrained_models import model_alias +from .pretrained_models import pretrained_models from paddleaudio import load from paddleaudio.features import LogMelSpectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import __all__ = ['CLSExecutor'] -pretrained_models = { - # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". - # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". 
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: - # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" - "panns_cnn6-32k": { - 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz', - 'md5': '4cf09194a95df024fd12f84712cf0f9c', - 'cfg_path': 'panns.yaml', - 'ckpt_path': 'cnn6.pdparams', - 'label_file': 'audioset_labels.txt', - }, - "panns_cnn10-32k": { - 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz', - 'md5': 'cb8427b22176cc2116367d14847f5413', - 'cfg_path': 'panns.yaml', - 'ckpt_path': 'cnn10.pdparams', - 'label_file': 'audioset_labels.txt', - }, - "panns_cnn14-32k": { - 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz', - 'md5': 'e3b9b5614a1595001161d0ab95edee97', - 'cfg_path': 'panns.yaml', - 'ckpt_path': 'cnn14.pdparams', - 'label_file': 'audioset_labels.txt', - }, -} - -model_alias = { - "panns_cnn6": "paddlespeech.cls.models.panns:CNN6", - "panns_cnn10": "paddlespeech.cls.models.panns:CNN10", - "panns_cnn14": "paddlespeech.cls.models.panns:CNN14", -} - @cli_register( name='paddlespeech.cls', description='Audio classification infer command.') class CLSExecutor(BaseExecutor): def __init__(self): - super(CLSExecutor, self).__init__() + super().__init__() + self.model_alias = model_alias + self.pretrained_models = pretrained_models self.parser = argparse.ArgumentParser( prog='paddlespeech.cls', add_help=True) @@ -83,7 +51,9 @@ class CLSExecutor(BaseExecutor): '--model', type=str, default='panns_cnn14', - choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + choices=[ + tag[:tag.index('-')] for tag in self.pretrained_models.keys() + ], help='Choose model type of cls task.') self.parser.add_argument( '--config', @@ -121,23 +91,6 @@ class CLSExecutor(BaseExecutor): action='store_true', help='Increase logger verbosity of current task.') - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path - def _init_from_path(self, model_type: str='panns_cnn14', cfg_path: Optional[os.PathLike]=None, @@ -153,12 +106,12 @@ class CLSExecutor(BaseExecutor): if label_file is None or ckpt_path is None: tag = model_type + '-' + '32k' # panns_cnn14-32k self.res_path = self._get_pretrained_path(tag) - self.cfg_path = os.path.join(self.res_path, - pretrained_models[tag]['cfg_path']) - self.label_file = os.path.join(self.res_path, - pretrained_models[tag]['label_file']) - self.ckpt_path = os.path.join(self.res_path, - pretrained_models[tag]['ckpt_path']) + self.cfg_path = os.path.join( + self.res_path, self.pretrained_models[tag]['cfg_path']) + self.label_file = os.path.join( + self.res_path, self.pretrained_models[tag]['label_file']) + self.ckpt_path = os.path.join( + self.res_path, self.pretrained_models[tag]['ckpt_path']) else: self.cfg_path = os.path.abspath(cfg_path) self.label_file = os.path.abspath(label_file) @@ -175,7 +128,7 @@ class CLSExecutor(BaseExecutor): self._label_list.append(line.strip()) # model - model_class = dynamic_import(model_type, model_alias) + model_class = dynamic_import(model_type, self.model_alias) model_dict = paddle.load(self.ckpt_path) self.model = model_class(extract_embedding=False) self.model.set_state_dict(model_dict) diff --git a/paddlespeech/cli/cls/pretrained_models.py b/paddlespeech/cli/cls/pretrained_models.py new file mode 100644 index 00000000..1d66850a --- /dev/null +++ b/paddlespeech/cli/cls/pretrained_models.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". 
+ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "panns_cnn6-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz', + 'md5': '4cf09194a95df024fd12f84712cf0f9c', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn6.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz', + 'md5': 'cb8427b22176cc2116367d14847f5413', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn10.pdparams', + 'label_file': 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz', + 'md5': 'e3b9b5614a1595001161d0ab95edee97', + 'cfg_path': 'panns.yaml', + 'ckpt_path': 'cnn14.pdparams', + 'label_file': 'audioset_labels.txt', + }, +} + +model_alias = { + "panns_cnn6": "paddlespeech.cls.models.panns:CNN6", + "panns_cnn10": "paddlespeech.cls.models.panns:CNN10", + "panns_cnn14": "paddlespeech.cls.models.panns:CNN14", +} diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index 064939a8..df0b6783 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -25,6 +25,8 @@ from typing import Union import paddle from .log import logger +from .utils import download_and_decompress +from .utils import MODEL_HOME class BaseExecutor(ABC): @@ -35,19 +37,8 @@ class BaseExecutor(ABC): def __init__(self): self._inputs = OrderedDict() self._outputs = OrderedDict() - - @abstractmethod - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - - Args: - tag (str): A tag of pretrained model. - - Returns: - os.PathLike: The path on which resources of pretrained model locate. - """ - pass + self.pretrained_models = OrderedDict() + self.model_alias = OrderedDict() @abstractmethod def _init_from_path(self, *args, **kwargs): @@ -227,3 +218,20 @@ class BaseExecutor(ABC): ] for l in loggers: l.disabled = True + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. 
+ """ + support_models = list(self.pretrained_models.keys()) + assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(self.pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index e64fc57d..29d95f79 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -32,40 +32,24 @@ from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME from ..utils import stats_wrapper +from .pretrained_models import kaldi_bins +from .pretrained_models import model_alias +from .pretrained_models import pretrained_models from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ["STExecutor"] -pretrained_models = { - "fat_st_ted-en-zh": { - "url": - "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz", - "md5": - "d62063f35a16d91210a71081bd2dd557", - "cfg_path": - "model.yaml", - "ckpt_path": - "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", - } -} - -model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"} - -kaldi_bins = { - "url": - "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz", - "md5": - "c0682303b3f3393dbf6ed4c4e35a53eb", -} - @cli_register( name="paddlespeech.st", description="Speech translation infer command.") class STExecutor(BaseExecutor): def __init__(self): - super(STExecutor, self).__init__() + super().__init__() + self.model_alias = model_alias + self.pretrained_models = pretrained_models + self.kaldi_bins = kaldi_bins self.parser = argparse.ArgumentParser( prog="paddlespeech.st", add_help=True) @@ -75,7 +59,9 @@ class STExecutor(BaseExecutor): "--model", type=str, default="fat_st_ted", - choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + choices=[ + tag[:tag.index('-')] for tag in self.pretrained_models.keys() + ], help="Choose model type of st task.") self.parser.add_argument( "--src_lang", @@ -119,28 +105,11 @@ class STExecutor(BaseExecutor): action='store_true', help='Increase logger verbosity of current task.') - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. - """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - "Use pretrained model stored in: {}".format(decompressed_path)) - - return decompressed_path - def _set_kaldi_bins(self) -> os.PathLike: """ Download and returns kaldi_bins resources path of current task. 
""" - decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME) + decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME) decompressed_path = os.path.abspath(decompressed_path) logger.info("Kaldi_bins stored in: {}".format(decompressed_path)) if "LD_LIBRARY_PATH" in os.environ: @@ -197,7 +166,7 @@ class STExecutor(BaseExecutor): model_conf = self.config model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} - model_class = dynamic_import(model_name, model_alias) + model_class = dynamic_import(model_name, self.model_alias) self.model = model_class.from_config(model_conf) self.model.eval() diff --git a/paddlespeech/cli/st/pretrained_models.py b/paddlespeech/cli/st/pretrained_models.py new file mode 100644 index 00000000..cc7410d2 --- /dev/null +++ b/paddlespeech/cli/st/pretrained_models.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + "fat_st_ted-en-zh": { + "url": + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz", + "md5": + "d62063f35a16d91210a71081bd2dd557", + "cfg_path": + "model.yaml", + "ckpt_path": + "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams", + } +} + +model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"} + +kaldi_bins = { + "url": + "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz", + "md5": + "c0682303b3f3393dbf6ed4c4e35a53eb", +} diff --git a/paddlespeech/cli/stats/infer.py b/paddlespeech/cli/stats/infer.py index 4ef50449..7cf4f236 100644 --- a/paddlespeech/cli/stats/infer.py +++ b/paddlespeech/cli/stats/infer.py @@ -16,7 +16,6 @@ from typing import List from prettytable import PrettyTable -from ..log import logger from ..utils import cli_register from ..utils import stats_wrapper @@ -27,7 +26,8 @@ model_name_format = { 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', - 'tts': 'Model-Language' + 'tts': 'Model-Language', + 'vector': 'Model-Sample Rate' } @@ -36,18 +36,18 @@ model_name_format = { description='Get speech tasks support models list.') class StatsExecutor(): def __init__(self): - super(StatsExecutor, self).__init__() + super().__init__() self.parser = argparse.ArgumentParser( prog='paddlespeech.stats', add_help=True) + self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector'] self.parser.add_argument( '--task', type=str, default='asr', - choices=['asr', 'cls', 'st', 'text', 'tts'], + choices=self.task_choices, help='Choose speech task.', required=True) - self.task_choices = ['asr', 'cls', 'st', 'text', 'tts'] def show_support_models(self, pretrained_models: dict): fields = model_name_format[self.task].split("-") @@ -61,73 +61,15 @@ class StatsExecutor(): Command line entry. 
""" parser_args = self.parser.parse_args(argv) - self.task = parser_args.task - if self.task not in self.task_choices: - logger.error( - "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" - ) + has_exceptions = False + try: + self(parser_args.task) + except Exception as e: + has_exceptions = True + if has_exceptions: return False - - elif self.task == 'asr': - try: - from ..asr.infer import pretrained_models - logger.info( - "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" - ) - self.show_support_models(pretrained_models) - return True - except BaseException: - logger.error("Failed to get the list of ASR pretrained models.") - return False - - elif self.task == 'cls': - try: - from ..cls.infer import pretrained_models - logger.info( - "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" - ) - self.show_support_models(pretrained_models) - return True - except BaseException: - logger.error("Failed to get the list of CLS pretrained models.") - return False - - elif self.task == 'st': - try: - from ..st.infer import pretrained_models - logger.info( - "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" - ) - self.show_support_models(pretrained_models) - return True - except BaseException: - logger.error("Failed to get the list of ST pretrained models.") - return False - - elif self.task == 'text': - try: - from ..text.infer import pretrained_models - logger.info( - "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" - ) - self.show_support_models(pretrained_models) - return True - except BaseException: - logger.error( - "Failed to get the list of TEXT pretrained models.") - return False - - elif self.task == 'tts': - try: - from ..tts.infer import pretrained_models - logger.info( - "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" - ) - self.show_support_models(pretrained_models) - return True - except BaseException: - logger.error("Failed to get the list of TTS pretrained models.") - return False + else: + return True @stats_wrapper def __call__( @@ -138,13 +80,12 @@ class StatsExecutor(): """ self.task = task if self.task not in self.task_choices: - print( - "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']" - ) + print("Please input correct speech task, choices = " + str( + self.task_choices)) elif self.task == 'asr': try: - from ..asr.infer import pretrained_models + from ..asr.pretrained_models import pretrained_models print( "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API" ) @@ -154,7 +95,7 @@ class StatsExecutor(): elif self.task == 'cls': try: - from ..cls.infer import pretrained_models + from ..cls.pretrained_models import pretrained_models print( "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API" ) @@ -164,7 +105,7 @@ class StatsExecutor(): elif self.task == 'st': try: - from ..st.infer import pretrained_models + from ..st.pretrained_models import pretrained_models print( "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API" ) @@ -174,7 +115,7 @@ class StatsExecutor(): elif self.task == 'text': try: - from 
..text.infer import pretrained_models + from ..text.pretrained_models import pretrained_models print( "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API" ) @@ -184,10 +125,22 @@ class StatsExecutor(): elif self.task == 'tts': try: - from ..tts.infer import pretrained_models + from ..tts.pretrained_models import pretrained_models print( "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API" ) self.show_support_models(pretrained_models) except BaseException: print("Failed to get the list of TTS pretrained models.") + + elif self.task == 'vector': + try: + from ..vector.pretrained_models import pretrained_models + print( + "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API" + ) + self.show_support_models(pretrained_models) + except BaseException: + print( + "Failed to get the list of Speaker Recognition pretrained models." + ) diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index dcf306c6..69e62e4b 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -25,58 +25,21 @@ from ...s2t.utils.dynamic_import import dynamic_import from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register -from ..utils import download_and_decompress -from ..utils import MODEL_HOME from ..utils import stats_wrapper +from .pretrained_models import model_alias +from .pretrained_models import pretrained_models +from .pretrained_models import tokenizer_alias __all__ = ['TextExecutor'] -pretrained_models = { - # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". - # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". 
- # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: - # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" - "ernie_linear_p7_wudao-punc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz', - 'md5': - '12283e2ddde1797c5d1e57036b512746', - 'cfg_path': - 'ckpt/model_config.json', - 'ckpt_path': - 'ckpt/model_state.pdparams', - 'vocab_file': - 'punc_vocab.txt', - }, - "ernie_linear_p3_wudao-punc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz', - 'md5': - '448eb2fdf85b6a997e7e652e80c51dd2', - 'cfg_path': - 'ckpt/model_config.json', - 'ckpt_path': - 'ckpt/model_state.pdparams', - 'vocab_file': - 'punc_vocab.txt', - }, -} - -model_alias = { - "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear", - "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear", -} - -tokenizer_alias = { - "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer", - "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer", -} - @cli_register(name='paddlespeech.text', description='Text infer command.') class TextExecutor(BaseExecutor): def __init__(self): - super(TextExecutor, self).__init__() + super().__init__() + self.model_alias = model_alias + self.pretrained_models = pretrained_models + self.tokenizer_alias = tokenizer_alias self.parser = argparse.ArgumentParser( prog='paddlespeech.text', add_help=True) @@ -92,7 +55,9 @@ class TextExecutor(BaseExecutor): '--model', type=str, default='ernie_linear_p7_wudao', - choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()], + choices=[ + tag[:tag.index('-')] for tag in self.pretrained_models.keys() + ], help='Choose model type of text task.') self.parser.add_argument( '--lang', @@ -131,23 +96,6 @@ class TextExecutor(BaseExecutor): action='store_true', help='Increase logger verbosity of current task.') - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - - return decompressed_path - def _init_from_path(self, task: str='punc', model_type: str='ernie_linear_p7_wudao', @@ -167,12 +115,12 @@ class TextExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None or vocab_file is None: tag = '-'.join([model_type, task, lang]) self.res_path = self._get_pretrained_path(tag) - self.cfg_path = os.path.join(self.res_path, - pretrained_models[tag]['cfg_path']) - self.ckpt_path = os.path.join(self.res_path, - pretrained_models[tag]['ckpt_path']) - self.vocab_file = os.path.join(self.res_path, - pretrained_models[tag]['vocab_file']) + self.cfg_path = os.path.join( + self.res_path, self.pretrained_models[tag]['cfg_path']) + self.ckpt_path = os.path.join( + self.res_path, self.pretrained_models[tag]['ckpt_path']) + self.vocab_file = os.path.join( + self.res_path, self.pretrained_models[tag]['vocab_file']) else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path) @@ -187,8 +135,8 @@ class TextExecutor(BaseExecutor): self._punc_list.append(line.strip()) # model - model_class = dynamic_import(model_name, model_alias) - tokenizer_class = dynamic_import(model_name, tokenizer_alias) + model_class = dynamic_import(model_name, self.model_alias) + tokenizer_class = dynamic_import(model_name, self.tokenizer_alias) self.model = model_class( cfg_path=self.cfg_path, ckpt_path=self.ckpt_path) self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0') diff --git a/paddlespeech/cli/text/pretrained_models.py b/paddlespeech/cli/text/pretrained_models.py new file mode 100644 index 00000000..817d3caa --- /dev/null +++ b/paddlespeech/cli/text/pretrained_models.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". + # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k". 
+ # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: + # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" + "ernie_linear_p7_wudao-punc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz', + 'md5': + '12283e2ddde1797c5d1e57036b512746', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + }, + "ernie_linear_p3_wudao-punc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz', + 'md5': + '448eb2fdf85b6a997e7e652e80c51dd2', + 'cfg_path': + 'ckpt/model_config.json', + 'ckpt_path': + 'ckpt/model_state.pdparams', + 'vocab_file': + 'punc_vocab.txt', + }, +} + +model_alias = { + "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear", + "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear", +} + +tokenizer_alias = { + "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer", + "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer", +} diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 1c3fb29f..1c719930 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -29,9 +29,9 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register -from ..utils import download_and_decompress -from ..utils import MODEL_HOME from ..utils import stats_wrapper +from .pretrained_models import model_alias +from .pretrained_models import pretrained_models from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -39,299 +39,14 @@ from paddlespeech.t2s.modules.normalizer import ZScore __all__ = ['TTSExecutor'] -pretrained_models = { - # speedyspeech - "speedyspeech_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip', - 'md5': - '6f6fa967b408454b6662c8c00c0027cb', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_30600.pdz', - 'speech_stats': - 'feats_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'tones_dict': - 'tone_id_map.txt', - }, - - # fastspeech2 - "fastspeech2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', - 'md5': - '637d28a5e53aa60275612ba4393d5f22', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_76000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "fastspeech2_ljspeech-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip', - 'md5': - 'ffed800c93deaf16ca9b3af89bfcd747', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_100000.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "fastspeech2_aishell3-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip', - 'md5': - 'f4dd4a5f49a4552b77981f544ab3392e', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_96400.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'speaker_dict': - 'speaker_id_map.txt', - }, - "fastspeech2_vctk-en": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip', - 'md5': - '743e5024ca1e17a88c5c271db9779ba4', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_66200.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - 'speaker_dict': - 'speaker_id_map.txt', - }, - # tacotron2 - "tacotron2_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip', - 'md5': - '0df4b6f0bcbe0d73c5ed6df8867ab91a', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_30600.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - "tacotron2_ljspeech-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip', - 'md5': - '6a5eddd81ae0e81d16959b97481135f3', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_60300.pdz', - 'speech_stats': - 'speech_stats.npy', - 'phones_dict': - 'phone_id_map.txt', - }, - - # pwgan - "pwgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip', - 'md5': - '2e481633325b5bdf0a3823c714d2c117', - 'config': - 'pwg_default.yaml', - 'ckpt': - 'pwg_snapshot_iter_400000.pdz', - 'speech_stats': - 'pwg_stats.npy', - }, - "pwgan_ljspeech-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip', - 'md5': - '53610ba9708fd3008ccaf8e99dacbaf0', - 'config': - 'pwg_default.yaml', - 'ckpt': - 'pwg_snapshot_iter_400000.pdz', - 'speech_stats': - 'pwg_stats.npy', - }, - "pwgan_aishell3-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip', - 'md5': - 'd7598fa41ad362d62f85ffc0f07e3d84', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1000000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - "pwgan_vctk-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', - 'md5': - 'b3da1defcde3e578be71eb284cb89f2c', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - # mb_melgan - "mb_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'ee5f0604e20091f0d495b6ec4618b90d', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1000000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - # style_melgan - "style_melgan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip', - 'md5': - '5de2d5348f396de0c966926b8c462755', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_1500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - # hifigan - "hifigan_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', - 'md5': - 'dd40a3d88dfcf64513fba2f0f961ada6', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - "hifigan_ljspeech-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip', - 'md5': - '70e9131695decbca06a65fe51ed38a72', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - "hifigan_aishell3-zh": { - 'url': - 
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip', - 'md5': - '3bb49bc75032ed12f79c00c8cc79a09a', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - "hifigan_vctk-en": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip', - 'md5': - '7da8f88359bca2457e705d924cf27bd4', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_2500000.pdz', - 'speech_stats': - 'feats_stats.npy', - }, - - # wavernn - "wavernn_csmsc-zh": { - 'url': - 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip', - 'md5': - 'ee37b752f09bcba8f2af3b777ca38e13', - 'config': - 'default.yaml', - 'ckpt': - 'snapshot_iter_400000.pdz', - 'speech_stats': - 'feats_stats.npy', - } -} - -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} - @cli_register( name='paddlespeech.tts', description='Text to Speech infer command.') class TTSExecutor(BaseExecutor): def __init__(self): super().__init__() + self.model_alias = model_alias + self.pretrained_models = pretrained_models self.parser = argparse.ArgumentParser( prog='paddlespeech.tts', add_help=True) @@ -449,22 +164,6 @@ class TTSExecutor(BaseExecutor): action='store_true', help='Increase logger verbosity of current task.') - def _get_pretrained_path(self, tag: str) -> os.PathLike: - """ - Download and returns pretrained resources path of current task. 
- """ - support_models = list(pretrained_models.keys()) - assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( - tag, '\n\t\t'.join(support_models)) - - res_path = os.path.join(MODEL_HOME, tag) - decompressed_path = download_and_decompress(pretrained_models[tag], - res_path) - decompressed_path = os.path.abspath(decompressed_path) - logger.info( - 'Use pretrained model stored in: {}'.format(decompressed_path)) - return decompressed_path - def _init_from_path( self, am: str='fastspeech2_csmsc', @@ -490,16 +189,15 @@ class TTSExecutor(BaseExecutor): if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None: am_res_path = self._get_pretrained_path(am_tag) self.am_res_path = am_res_path - self.am_config = os.path.join(am_res_path, - pretrained_models[am_tag]['config']) + self.am_config = os.path.join( + am_res_path, self.pretrained_models[am_tag]['config']) self.am_ckpt = os.path.join(am_res_path, - pretrained_models[am_tag]['ckpt']) + self.pretrained_models[am_tag]['ckpt']) self.am_stat = os.path.join( - am_res_path, pretrained_models[am_tag]['speech_stats']) + am_res_path, self.pretrained_models[am_tag]['speech_stats']) # must have phones_dict in acoustic self.phones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['phones_dict']) - print("self.phones_dict:", self.phones_dict) + am_res_path, self.pretrained_models[am_tag]['phones_dict']) logger.info(am_res_path) logger.info(self.am_config) logger.info(self.am_ckpt) @@ -509,21 +207,20 @@ class TTSExecutor(BaseExecutor): self.am_stat = os.path.abspath(am_stat) self.phones_dict = os.path.abspath(phones_dict) self.am_res_path = os.path.dirname(os.path.abspath(self.am_config)) - print("self.phones_dict:", self.phones_dict) # for speedyspeech self.tones_dict = None - if 'tones_dict' in pretrained_models[am_tag]: + if 'tones_dict' in self.pretrained_models[am_tag]: self.tones_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['tones_dict']) + am_res_path, self.pretrained_models[am_tag]['tones_dict']) if tones_dict: self.tones_dict = tones_dict # for multi speaker fastspeech2 self.speaker_dict = None - if 'speaker_dict' in pretrained_models[am_tag]: + if 'speaker_dict' in self.pretrained_models[am_tag]: self.speaker_dict = os.path.join( - am_res_path, pretrained_models[am_tag]['speaker_dict']) + am_res_path, self.pretrained_models[am_tag]['speaker_dict']) if speaker_dict: self.speaker_dict = speaker_dict @@ -532,12 +229,12 @@ class TTSExecutor(BaseExecutor): if voc_ckpt is None or voc_config is None or voc_stat is None: voc_res_path = self._get_pretrained_path(voc_tag) self.voc_res_path = voc_res_path - self.voc_config = os.path.join(voc_res_path, - pretrained_models[voc_tag]['config']) - self.voc_ckpt = os.path.join(voc_res_path, - pretrained_models[voc_tag]['ckpt']) + self.voc_config = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['config']) + self.voc_ckpt = os.path.join( + voc_res_path, self.pretrained_models[voc_tag]['ckpt']) self.voc_stat = os.path.join( - voc_res_path, pretrained_models[voc_tag]['speech_stats']) + voc_res_path, self.pretrained_models[voc_tag]['speech_stats']) logger.info(voc_res_path) logger.info(self.voc_config) logger.info(self.voc_ckpt) @@ -588,8 +285,9 @@ class TTSExecutor(BaseExecutor): # model: {model_name}_{dataset} am_name = am[:am.rindex('_')] - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + 
'_inference', model_alias) + am_class = dynamic_import(am_name, self.model_alias) + am_inference_class = dynamic_import(am_name + '_inference', + self.model_alias) if am_name == 'fastspeech2': am = am_class( @@ -618,9 +316,9 @@ class TTSExecutor(BaseExecutor): # vocoder # model: {model_name}_{dataset} voc_name = voc[:voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) + voc_class = dynamic_import(voc_name, self.model_alias) voc_inference_class = dynamic_import(voc_name + '_inference', - model_alias) + self.model_alias) if voc_name != 'wavernn': voc = voc_class(**self.voc_config["generator_params"]) voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"]) @@ -735,7 +433,6 @@ class TTSExecutor(BaseExecutor): am_ckpt = args.am_ckpt am_stat = args.am_stat phones_dict = args.phones_dict - print("phones_dict:", phones_dict) tones_dict = args.tones_dict speaker_dict = args.speaker_dict voc = args.voc diff --git a/paddlespeech/cli/tts/pretrained_models.py b/paddlespeech/cli/tts/pretrained_models.py new file mode 100644 index 00000000..65254a93 --- /dev/null +++ b/paddlespeech/cli/tts/pretrained_models.py @@ -0,0 +1,300 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + # speedyspeech + "speedyspeech_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip', + 'md5': + '6f6fa967b408454b6662c8c00c0027cb', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_30600.pdz', + 'speech_stats': + 'feats_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'tones_dict': + 'tone_id_map.txt', + }, + + # fastspeech2 + "fastspeech2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip', + 'md5': + '637d28a5e53aa60275612ba4393d5f22', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_76000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip', + 'md5': + 'ffed800c93deaf16ca9b3af89bfcd747', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_100000.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "fastspeech2_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip', + 'md5': + 'f4dd4a5f49a4552b77981f544ab3392e', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_96400.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + }, + "fastspeech2_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip', + 'md5': + '743e5024ca1e17a88c5c271db9779ba4', + 'config': + 
'default.yaml', + 'ckpt': + 'snapshot_iter_66200.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + 'speaker_dict': + 'speaker_id_map.txt', + }, + # tacotron2 + "tacotron2_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip', + 'md5': + '0df4b6f0bcbe0d73c5ed6df8867ab91a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_30600.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + "tacotron2_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip', + 'md5': + '6a5eddd81ae0e81d16959b97481135f3', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_60300.pdz', + 'speech_stats': + 'speech_stats.npy', + 'phones_dict': + 'phone_id_map.txt', + }, + + # pwgan + "pwgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip', + 'md5': + '2e481633325b5bdf0a3823c714d2c117', + 'config': + 'pwg_default.yaml', + 'ckpt': + 'pwg_snapshot_iter_400000.pdz', + 'speech_stats': + 'pwg_stats.npy', + }, + "pwgan_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip', + 'md5': + '53610ba9708fd3008ccaf8e99dacbaf0', + 'config': + 'pwg_default.yaml', + 'ckpt': + 'pwg_snapshot_iter_400000.pdz', + 'speech_stats': + 'pwg_stats.npy', + }, + "pwgan_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip', + 'md5': + 'd7598fa41ad362d62f85ffc0f07e3d84', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "pwgan_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip', + 'md5': + 'b3da1defcde3e578be71eb284cb89f2c', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # mb_melgan + "mb_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'ee5f0604e20091f0d495b6ec4618b90d', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1000000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # style_melgan + "style_melgan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip', + 'md5': + '5de2d5348f396de0c966926b8c462755', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_1500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + # hifigan + "hifigan_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip', + 'md5': + 'dd40a3d88dfcf64513fba2f0f961ada6', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip', + 'md5': + '70e9131695decbca06a65fe51ed38a72', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip', + 'md5': + '3bb49bc75032ed12f79c00c8cc79a09a', + 'config': + 'default.yaml', + 'ckpt': + 
'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip', + 'md5': + '7da8f88359bca2457e705d924cf27bd4', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + + # wavernn + "wavernn_csmsc-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip', + 'md5': + 'ee37b752f09bcba8f2af3b777ca38e13', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_400000.pdz', + 'speech_stats': + 'feats_stats.npy', + } +} + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index 68e832ac..1dff6edb 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -27,45 +27,24 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register -from ..utils import download_and_decompress -from ..utils import MODEL_HOME from ..utils import stats_wrapper +from .pretrained_models import model_alias +from .pretrained_models import pretrained_models from paddleaudio.backends import load as load_audio from paddleaudio.compliance.librosa import melspectrogram from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification -pretrained_models = { - # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". - # e.g. "ecapatdnn_voxceleb12-16k". 
-    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
-    "ecapatdnn_voxceleb12-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
-        'md5':
-        'cc33023c54ab346cd318408f43fcaf95',
-        'cfg_path':
-        'conf/model.yaml',  # the yaml config path
-        'ckpt_path':
-        'model/model',  # the format is ${dir}/{model_name},
-        # so the first 'model' is dir, the second 'model' is the name
-        # this means we have a model stored as model/model.pdparams
-    },
-}
-
-model_alias = {
-    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
-}
-

 @cli_register(
     name="paddlespeech.vector",
     description="Speech to vector embedding infer command.")
 class VectorExecutor(BaseExecutor):
     def __init__(self):
-        super(VectorExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models

         self.parser = argparse.ArgumentParser(
             prog="paddlespeech.vector", add_help=True)
@@ -128,8 +107,8 @@ class VectorExecutor(BaseExecutor):

         Returns:
             bool:
-                False: some audio occurs error
-                True: all audio process success
+            False: some audio files failed to process
+            True: all audio files processed successfully
         """
         # stage 0: parse the args and get the required args
         parser_args = self.parser.parse_args(argv)
@@ -289,32 +268,6 @@ class VectorExecutor(BaseExecutor):

         return res

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """get the neural network path from the pretrained model list
-        we stored all the pretained mode in the variable `pretrained_models`
-
-        Args:
-            tag (str): model tag in the pretrained model list
-
-        Returns:
-            os.PathLike: the downloaded pretrained model path in the disk
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported,'\
-            'please choose other models.\n' \
-            'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
-
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-
-        return decompressed_path
-
     def _init_from_path(self,
                         model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
@@ -350,10 +303,11 @@ class VectorExecutor(BaseExecutor):
             res_path = self._get_pretrained_path(tag)
             self.res_path = res_path

-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]['cfg_path'])
+            self.cfg_path = os.path.join(
+                res_path, self.pretrained_models[tag]['cfg_path'])
             self.ckpt_path = os.path.join(
-                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+                res_path,
+                self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             # get the model from disk
             self.cfg_path = os.path.abspath(cfg_path)
@@ -373,7 +327,7 @@ class VectorExecutor(BaseExecutor):
         logger.info("start to dynamic import the model class")
         model_name = model_type[:model_type.rindex('_')]
         logger.info(f"model name {model_name}")
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
         model_conf = self.config.model
         backbone = model_class(**model_conf)
         model = SpeakerIdetification(
diff --git a/paddlespeech/cli/vector/pretrained_models.py b/paddlespeech/cli/vector/pretrained_models.py
new file mode 100644
index
00000000..686a22d8 --- /dev/null +++ b/paddlespeech/cli/vector/pretrained_models.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". + # e.g. "ecapatdnn_voxceleb12-16k". + # Command line and python api use "{model_name}[-{dataset}]" as --model, usage: + # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav" + "ecapatdnn_voxceleb12-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz', + 'md5': + 'cc33023c54ab346cd318408f43fcaf95', + 'cfg_path': + 'conf/model.yaml', # the yaml config path + 'ckpt_path': + 'model/model', # the format is ${dir}/{model_name}, + # so the first 'model' is dir, the second 'model' is the name + # this means we have a model stored as model/model.pdparams + }, +} + +model_alias = { + "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn", +} diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 96ab84d6..87c24b09 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e + # Audio classification wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav paddlespeech cls --input ./cat.wav --topk 10 @@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨 paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." 
- # Speech Translation (only support linux) paddlespeech st --input ./en.wav - -# batch process -echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts - -# shell pipeline -paddlespeech asr --input ./zh.wav | paddlespeech text --task punc - -# stats -paddlespeech stats --task asr -paddlespeech stats --task tts -paddlespeech stats --task cls - # Speaker Verification wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav paddlespeech vector --task spk --input 85236145389.wav +# batch process +echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts + echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job paddlespeech vector --task spk --input vec.job @@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector - rm 85236145389.wav rm vec.job +# shell pipeline +paddlespeech asr --input ./zh.wav | paddlespeech text --task punc +# stats +paddlespeech stats --task asr +paddlespeech stats --task tts +paddlespeech stats --task cls +paddlespeech stats --task text +paddlespeech stats --task vector +paddlespeech stats --task st From 9e41ac8550b5f53b77ce3656e3561c58e0f25a82 Mon Sep 17 00:00:00 2001 From: lym0302 Date: Tue, 19 Apr 2022 15:51:44 +0800 Subject: [PATCH 18/18] code format, test=doc --- paddlespeech/server/engine/tts/online/tts_engine.py | 1 - 1 file changed, 1 deletion(-) diff --git a/paddlespeech/server/engine/tts/online/tts_engine.py b/paddlespeech/server/engine/tts/online/tts_engine.py index a84644e7..c9135b88 100644 --- a/paddlespeech/server/engine/tts/online/tts_engine.py +++ b/paddlespeech/server/engine/tts/online/tts_engine.py @@ -479,7 +479,6 @@ class TTSEngine(BaseEngine): def __init__(self, name=None): """Initialize TTS server engine """ - #super(TTSEngine, self).__init__() super().__init__() def init(self, config: dict) -> bool:
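
Both TTSExecutor and VectorExecutor in this series delete their private `_get_pretrained_path` copies and instead assign `self.pretrained_models` and `self.model_alias` in `__init__`, which suggests the download helper now lives once in the executor base class. A minimal sketch of that shared helper, reconstructed from the two deleted bodies — the base-class placement, the class name, and the exact messages here are assumptions, not a quote of the new code:

    # Hypothetical reconstruction; MODEL_HOME, download_and_decompress and
    # logger are real names taken from the imports deleted above
    # (paddlespeech.cli.utils and paddlespeech.cli.log).
    import os

    from paddlespeech.cli.log import logger
    from paddlespeech.cli.utils import download_and_decompress
    from paddlespeech.cli.utils import MODEL_HOME


    class BaseExecutorSketch:
        pretrained_models = {}  # each subclass assigns its own table in __init__

        def _get_pretrained_path(self, tag: str) -> os.PathLike:
            """Download (on first use) and return the resources for a model tag."""
            assert tag in self.pretrained_models, (
                'The model "{}" you want to use has not been supported, '
                'please choose other models.\nThe support models include:\n\t\t{}\n'
                .format(tag, '\n\t\t'.join(self.pretrained_models.keys())))
            res_path = os.path.join(MODEL_HOME, tag)
            decompressed_path = os.path.abspath(
                download_and_decompress(self.pretrained_models[tag], res_path))
            logger.info(
                'Use pretrained model stored in: {}'.format(decompressed_path))
            return decompressed_path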
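
With the tables split out into the new `pretrained_models.py` modules, resolving a model tag into local config and checkpoint paths follows the lookup shown in `VectorExecutor._init_from_path`. A usage sketch under stated assumptions: the `tag` assembly from `model_type` and `sample_rate` is inferred from the `"{model_name}[-{dataset}][-{sr}]"` comment and is not shown in the hunks above.

    import os

    from paddlespeech.cli.vector.infer import VectorExecutor
    from paddlespeech.cli.vector.pretrained_models import pretrained_models

    model_type, sample_rate = 'ecapatdnn_voxceleb12', 16000
    # assumed assembly: 'ecapatdnn_voxceleb12' + '-' + '16k'
    tag = model_type + '-' + str(sample_rate // 1000) + 'k'

    executor = VectorExecutor()
    res_path = executor._get_pretrained_path(tag)  # downloads and unpacks on first use
    cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
    # 'model/model' + '.pdparams' -> <res_path>/model/model.pdparams
    ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')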
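
The `model_alias` tables pair short model names with `"module:ClassName"` strings that `dynamic_import` turns into classes. The real helper is `paddlespeech.s2t.utils.dynamic_import.dynamic_import` (imported in both `infer.py` files above); the standalone re-implementation below is illustrative only and may differ from its internals.

    import importlib

    from paddlespeech.cli.tts.pretrained_models import model_alias


    def dynamic_import_sketch(name: str, alias: dict):
        """Resolve a short model name such as "hifigan" to its class object."""
        module_name, class_name = alias[name].split(':')
        return getattr(importlib.import_module(module_name), class_name)


    # mirrors _init_from_path: "hifigan_csmsc" -> "hifigan" -> HiFiGANGenerator
    voc = 'hifigan_csmsc'
    voc_name = voc[:voc.rindex('_')]  # strip the "_{dataset}" suffix
    voc_class = dynamic_import_sketch(voc_name, model_alias)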