Merge pull request #1616 from zh794390558/spx

[speechx] more comment of code
4 years ago · b75268c588
parent 2ea578e861 84d712d493
commit b75268c588
22 changed files with 226 additions and 33 deletions
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@ -20,12 +20,12 @@ of each audio file in the data set.
 """
 import argparse
 import codecs
 import distutils.util
 import io
 import json
 import os
 from multiprocessing.pool import Pool
 import distutils.util
 import soundfile
 from utils.utility import download
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@ -16,8 +16,8 @@ import os
 import librosa
 import numpy as np
 from config import DEFAULT_TABLE
 from logs import LOGGER
 from paddlespeech.cli import VectorExecutor
 vector_executor = VectorExecutor()
--- a/demos/audio_searching/src/operations/load.py
+++ b/demos/audio_searching/src/operations/load.py
@ -26,9 +26,8 @@ def get_audios(path):
    """
    supported_formats = [".wav", ".mp3", ".ogg", ".flac", ".m4a"]
    return [
-        item
+        item for sublist in [[os.path.join(dir, file) for file in files]
-        for sublist in [[os.path.join(dir, file) for file in files]
+                             for dir, _, files in list(os.walk(path))]
                        for dir, _, files in list(os.walk(path))]
        for item in sublist if os.path.splitext(item)[1] in supported_formats
    ]
--- a/examples/ami/sd0/local/ami_prepare.py
+++ b/examples/ami/sd0/local/ami_prepare.py
@ -24,11 +24,11 @@ import json
 import logging
 import os
 import xml.etree.ElementTree as et
 from distutils.util import strtobool
 from ami_splits import get_AMI_split
 from dataio import load_pkl
 from dataio import save_pkl
 from distutils.util import strtobool
 logger = logging.getLogger(__name__)
 SAMPLERATE = 16000
--- a/paddlespeech/s2t/decoders/recog_bin.py
+++ b/paddlespeech/s2t/decoders/recog_bin.py
@ -17,10 +17,10 @@ import logging
 import os
 import random
 import sys
 from distutils.util import strtobool
 import configargparse
 import numpy as np
 from distutils.util import strtobool
 def get_parser():
--- a/paddlespeech/s2t/utils/cli_utils.py
+++ b/paddlespeech/s2t/utils/cli_utils.py
@ -14,9 +14,9 @@
 # Modified from espnet(https://github.com/espnet/espnet)
 import sys
 from collections.abc import Sequence
 from distutils.util import strtobool as dist_strtobool
 import numpy
 from distutils.util import strtobool as dist_strtobool
 def strtobool(x):
--- a/paddlespeech/s2t/utils/utility.py
+++ b/paddlespeech/s2t/utils/utility.py
@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Contains common utility functions."""
 import distutils.util
 import math
 import os
 import random
@ -21,6 +20,7 @@ from contextlib import contextmanager
 from pprint import pformat
 from typing import List
 import distutils.util
 import numpy as np
 import paddle
 import soundfile
--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@ -18,11 +18,11 @@ A few sklearn functions are modified in this script as per requirement.
 """
 import argparse
 import warnings
 from distutils.util import strtobool
 import numpy as np
 import scipy
 import sklearn
 from distutils.util import strtobool
 from scipy import sparse
 from scipy.sparse.csgraph import connected_components
 from scipy.sparse.csgraph import laplacian as csgraph_laplacian
--- a/speechx/README.md
+++ b/speechx/README.md
@ -5,7 +5,7 @@
 We develop under:
 * docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
 * os - Ubuntu 16.04.7 LTS
-* gcc/g++ - 8.2.0
+* **gcc/g++/gfortran - 8.2.0**
 * cmake - 3.16.0
 > We make sure everything works fine under docker, and recommend using it for development and deployment.
@ -29,6 +29,8 @@ nvidia-docker run --privileged  --net=host --ipc=host -it --rm -v $PWD:/workspac
 2. Build `speechx` and `examples`.
 > Do not source venv.
 ```
 pushd /path/to/speechx
 ./build.sh
--- a/speechx/build.sh
+++ b/speechx/build.sh
@ -3,7 +3,6 @@
 # This build script has been verified in the PaddlePaddle docker image.
 # Please follow the instructions below to install the PaddlePaddle image.
 # https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html 
 boost_SOURCE_DIR=$PWD/fc_patch/boost-src
 if [ ! -d ${boost_SOURCE_DIR} ]; then wget -c https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz 
  tar xzfv boost_1_75_0.tar.gz
@ -23,6 +22,6 @@ cd build
 cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
 #cmake .. 
-make -j1
+make -j10
 cd -
--- a/speechx/cmake/FindGFortranLibs.cmake
+++ b/speechx/cmake/FindGFortranLibs.cmake
@ -0,0 +1,145 @@
 #.rst:
 # FindGFortranLibs
 # --------
 #  https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
 #  https://enccs.github.io/cmake-workshop/cxx-fortran/
 #
 # Find gcc Fortran compiler & library paths
 #
 # The module defines the following variables:
 #
 # ::
 #
 #
 #   GFORTRANLIBS_FOUND - true if system has gfortran
 #   LIBGFORTRAN_LIBRARIES - path to libgfortran
 #   LIBQUADMATH_LIBRARIES - path to libquadmath
 #   GFORTRAN_LIBRARIES_DIR - directory containing libgfortran, libquadmath
 #   GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
 #   LIBGOMP_LIBRARIES - path to libgomp
 #   LIBGOMP_INCLUDE_DIR - directory containing omp.h header
 #   GFORTRAN_VERSION_STRING - version of gfortran found
 #
 # Honor find_package(GFortranLibs QUIET): find_package() exports the QUIET
 # option as <PackageName>_FIND_QUIETLY, and this module's package name is
 # GFortranLibs (see find_package_handle_standard_args further down).
 set(CMAKE_REQUIRED_QUIET ${GFortranLibs_FIND_QUIETLY})
 if(NOT CMAKE_REQUIRED_QUIET)
  message(STATUS "Looking for gfortran related libraries...")
 endif()
 enable_language(Fortran)
 if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
  # Basically, call "gfortran -v" to dump compiler info to the string
  # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
  message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
  execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
    GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
  # For debugging
  message(STATUS "'gfortran -v' returned:")
  message(STATUS "${GFORTRAN_VERBOSE_STR}")
  # Detect gfortran version
  string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
  string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
  message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
  unset(GFORTRAN_VER_STR)
  set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
  set(REPLACE_REGEX "([^\t\n ]+)")
  # Find architecture for compiler
  string(REGEX MATCH "Target: [^\t\n ]+"
    GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
  message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
  string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
    GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
  message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
  unset(GFORTRAN_ARCH_STR)
  # Find install prefix, if it exists; if not, use default.
  # The match stops at the first whitespace character: REGEX REPLACE below
  # only rewrites the matched text, so any whitespace captured here would
  # survive into GFORTRAN_PREFIX_DIR and into every path built from it.
  string(REGEX MATCH "--prefix=[^\t\n ]+"
    GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_PREFIX_STR)
    message(STATUS "Detected default gfortran prefix")
    set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
  else()
    # Keep only the path that follows "--prefix="
    string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
      GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
  endif()
  message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
  unset(GFORTRAN_PREFIX_STR)
  # Find install exec-prefix, if it exists; if not, fall back to the prefix.
  # string(REGEX MATCH <regex> <out-var> <input>...) takes no replacement
  # expression; an extra argument after the regex would be taken as the
  # output variable and leave GFORTRAN_EXEC_PREFIX_STR unset.
  # The match stops at the first whitespace character so that no trailing
  # blank/newline ends up in GFORTRAN_EXEC_PREFIX_DIR.
  string(REGEX MATCH "--exec-prefix=[^\t\n ]+"
    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_EXEC_PREFIX_STR)
    message(STATUS "Detected default gfortran exec-prefix")
    set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
  else()
    # Keep only the path that follows "--exec-prefix="
    string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
      GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
  endif()
  message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
  unset(GFORTRAN_EXEC_PREFIX_STR)
  # Find library directory and include directory, if library directory specified
  string(REGEX MATCH "--libdir=[^\t\n ]+"
    GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
  if(NOT GFORTRAN_LIB_DIR_STR)
    message(STATUS "Found --libdir flag -- not found")
    message(STATUS "Using default gfortran library & include directory paths")
    set(GFORTRAN_LIBRARIES_DIR
      "${GFORTRAN_EXEC_PREFIX_DIR}/lib/gcc/${GFORTRAN_ARCH}/${GFORTRAN_VERSION_STRING}")
    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/include")
  else()
    message(STATUS "Found --libdir flag -- yes")
    string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
      GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
    string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
  endif()
  message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
  message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
  unset(GFORTRAN_LIB_DIR_STR)
  # There are lots of other build options for gcc & gfortran. For now, the
  # options implemented above should cover a lot of common use cases.
  # Clean up by deleting the output string from "gfortran -v"
  unset(GFORTRAN_VERBOSE_STR)
  # Find paths for libgfortran, libquadmath, libgomp
  # libgomp needed for OpenMP support without Clang
  find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
    HINTS ${GFORTRAN_LIBRARIES_DIR})
  find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
    HINTS ${GFORTRAN_LIBRARIES_DIR})
  find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
    HINTS ${GFORTRAN_LIBRARIES_DIR})
  # Find OpenMP headers
  find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
 else()
  message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
 endif()
 include(FindPackageHandleStandardArgs)
 # Required: libgfortran, libquadmath, path for gfortran libraries
 # Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
 find_package_handle_standard_args(GFortranLibs
  REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
  VERSION_VAR GFORTRAN_VERSION_STRING)
 if(GFORTRANLIBS_FOUND)
  message(STATUS "Looking for gfortran libraries -- found")
  message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
 else()
  message(STATUS "Looking for gfortran libraries -- not found")
 endif()
 mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
  LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
  GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
 # FindGFortranLIBS.cmake ends here
--- a/speechx/cmake/external/openblas.cmake
+++ b/speechx/cmake/external/openblas.cmake
@ -7,6 +7,27 @@ set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix)
 # OPENBLAS  https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575
 # ######################################################################################################################
 enable_language(Fortran)
 include(FortranCInterface)
 # # Clang doesn't have a Fortran compiler in its suite (yet),
 # # so detect libraries for gfortran; we need equivalents to
 # # libgfortran and libquadmath, which are implicitly
 # # linked by flags in CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES
 # include(FindGFortranLibs REQUIRED)
 # # Add directory containing libgfortran and libquadmath to
 # # linker. Should also contain libgomp, if not using
 # # Intel OpenMP runtime
 # link_directories(${GFORTRAN_LIBRARIES_DIR})
 # # gfortran dir in the docker.
 # link_directories(/usr/local/gcc-8.2/lib64)
 # # if you are working with C and Fortran
 # FortranCInterface_VERIFY()
 # # if you are working with C++ and Fortran
 # FortranCInterface_VERIFY(CXX)
 #TODO: switch to CPM
 include(GNUInstallDirs)
 ExternalProject_Add(
--- a/speechx/cmake/external/openfst.cmake
+++ b/speechx/cmake/external/openfst.cmake
@ -1,13 +1,14 @@
 include(FetchContent)
 set(openfst_PREFIX_DIR ${fc_patch}/openfst)
 set(openfst_SOURCE_DIR ${fc_patch}/openfst-src)
 set(openfst_BINARY_DIR ${fc_patch}/openfst-build)
 ExternalProject_Add(openfst
  URL               https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip
  URL_HASH          SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
-#   #PREFIX            ${openfst_PREFIX_DIR} 
+  PREFIX            ${openfst_PREFIX_DIR} 
-#   SOURCE_DIR        ${openfst_SOURCE_DIR}
+  SOURCE_DIR        ${openfst_SOURCE_DIR}
-#   BINARY_DIR        ${openfst_BINARY_DIR}
+  BINARY_DIR        ${openfst_BINARY_DIR}
  CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR}
                      "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
                      "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/feature_cache.cc
@ -41,6 +41,7 @@ void FeatureCache::Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) {
 // pop feature chunk
 bool FeatureCache::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
    kaldi::Timer timer;
    std::unique_lock<std::mutex> lock(mutex_);
    while (cache_.empty() && base_extractor_->IsFinished() == false) {
        ready_read_condition_.wait(lock);
@ -64,10 +65,13 @@ bool FeatureCache::Compute() {
    // compute and feed
    Vector<BaseFloat> feature_chunk;
    bool result = base_extractor_->Read(&feature_chunk);
    std::unique_lock<std::mutex> lock(mutex_);
    while (cache_.size() >= max_size_) {
        ready_feed_condition_.wait(lock);
    }
    // feed cache
    if (feature_chunk.Dim() != 0) {
        cache_.push(feature_chunk);
    }
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/feature_cache.h
@ -24,17 +24,24 @@ class FeatureCache : public FeatureExtractorInterface {
    explicit FeatureCache(
        int32 max_size = kint16max,
        std::unique_ptr<FeatureExtractorInterface> base_extractor = NULL);
    // Feed feats or waves
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
    // feats dim = num_frames * feature_dim
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
    // The feature cache only caches features produced by the base extractor.
    virtual size_t Dim() const { return base_extractor_->Dim(); }
    // Forwards the end-of-stream flag to the wrapped extractor, then runs
    // Compute() once more. The order matters: the extractor is marked
    // finished before the final Compute() call.
    virtual void SetFinished() {
        base_extractor_->SetFinished();
        // read the last chunk data
        Compute();
    }
    virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
    virtual void Reset() {
        base_extractor_->Reset();
        while (!cache_.empty()) {
@ -45,12 +52,14 @@ class FeatureCache : public FeatureExtractorInterface {
  private:
    bool Compute();
    std::mutex mutex_;
    size_t max_size_;
    std::queue<kaldi::Vector<BaseFloat>> cache_;
    std::unique_ptr<FeatureExtractorInterface> base_extractor_;
    std::mutex mutex_;
    std::queue<kaldi::Vector<BaseFloat>> cache_;
    std::condition_variable ready_feed_condition_;
    std::condition_variable ready_read_condition_;
    // DISALLOW_COPY_AND_ASSGIN(FeatureCache);
 };
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/feature_extractor_interface.h
@ -21,17 +21,26 @@ namespace ppspeech {
 class FeatureExtractorInterface {
  public:
-    // accept input data, accept feature or raw waves which decided
+    // Feed inputs: features(2D saved in 1D) or waveforms(1D).
    // by the base_extractor
    virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs) = 0;
-    // get the processed result
+
-    // the length of output = feature_row * feature_dim,
+    // Fetch processed data: features or waveforms.
-    // the Matrix is squashed into Vector
+    // For features(2D saved in 1D), the Matrix is squashed into Vector,
    //    the length of output = feature_row * feature_dim.
    // For waveforms(1D), samples saved in vector.
    virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* outputs) = 0;
-    // the Dim is the feature dim
+
    // Dim is the feature dim. For waveforms(1D), Dim is zero; else is specific,
    // e.g 80 for fbank.
    virtual size_t Dim() const = 0;
    // End Flag for Streaming Data.
    virtual void SetFinished() = 0;
    // whether is end of Streaming Data.
    virtual bool IsFinished() const = 0;
    // Reset to start state.
    virtual void Reset() = 0;
 };
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/linear_spectrogram.h
@ -23,12 +23,14 @@ namespace ppspeech {
 struct LinearSpectrogramOptions {
    kaldi::FrameExtractionOptions frame_opts;
-    kaldi::BaseFloat streaming_chunk;
+    kaldi::BaseFloat streaming_chunk;  // second
    // Default options: streaming_chunk is 0.36 (seconds).
    // Initializers are listed in member declaration order (frame_opts, then
    // streaming_chunk); members are always initialized in that order, so
    // writing them this way avoids -Wreorder warnings without changing
    // behavior.
    LinearSpectrogramOptions() : frame_opts(), streaming_chunk(0.36) {}
    void Register(kaldi::OptionsItf* opts) {
-        opts->Register(
+        opts->Register("streaming-chunk",
-            "streaming-chunk", &streaming_chunk, "streaming chunk size");
+                       &streaming_chunk,
                       "streaming chunk size, default: 0.36 sec");
        frame_opts.Register(opts);
    }
 };
--- a/utils/DER.py
+++ b/utils/DER.py
@ -26,9 +26,9 @@ import argparse
 import os
 import re
 import subprocess
 from distutils.util import strtobool
 import numpy as np
 from distutils.util import strtobool
 FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
 SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
--- a/utils/addjson.py
+++ b/utils/addjson.py
@ -10,8 +10,8 @@ import codecs
 import json
 import logging
 import sys
 from distutils.util import strtobool
 from distutils.util import strtobool
 from espnet.utils.cli_utils import get_commandline_args
 is_python2 = sys.version_info[0] == 2
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 import argparse
 import logging
 from distutils.util import strtobool
 import kaldiio
 import numpy
 from distutils.util import strtobool
 from paddlespeech.s2t.transform.cmvn import CMVN
 from paddlespeech.s2t.utils.cli_readers import file_reader_helper
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 import argparse
 import logging
 from distutils.util import strtobool
 from paddlespeech.s2t.transform.transformation import Transformation
--- a/utils/merge_scp2json.py
+++ b/utils/merge_scp2json.py
@ -5,9 +5,10 @@ import codecs
 import json
 import logging
 import sys
 from distutils.util import strtobool
 from io import open
 from distutils.util import strtobool
 from paddlespeech.s2t.utils.cli_utils import get_commandline_args
 PY2 = sys.version_info[0] == 2