From f8caaf46c8c35dbecb879cc2d4acea0de13bb45d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Tue, 27 Dec 2022 16:30:57 +0800 Subject: [PATCH] refactor cmake, rm absl/linsndfile, add strings unittest (#2765) --- speechx/CMakeLists.txt | 54 ++++----- speechx/README.md | 8 ++ speechx/cmake/gtest.cmake | 8 +- speechx/cmake/openfst.cmake | 2 + speechx/cmake/system.cmake | 106 ++++++++++++++++++ speechx/speechx/asr/decoder/CMakeLists.txt | 2 +- .../decoder/ctc_prefix_beam_search_decoder.cc | 1 - .../ctc_prefix_beam_search_decoder_main.cc | 1 - speechx/speechx/asr/nnet/CMakeLists.txt | 2 +- speechx/speechx/asr/nnet/ds2_nnet.cc | 12 +- .../asr/server/websocket/CMakeLists.txt | 2 +- speechx/speechx/common/utils/CMakeLists.txt | 16 ++- speechx/speechx/common/utils/math.cc | 5 +- speechx/speechx/common/utils/strings.cc | 50 +++++++++ speechx/speechx/common/utils/strings.h | 26 +++++ speechx/speechx/common/utils/strings_test.cc | 35 ++++++ 16 files changed, 283 insertions(+), 47 deletions(-) create mode 100644 speechx/cmake/system.cmake create mode 100644 speechx/speechx/common/utils/strings.cc create mode 100644 speechx/speechx/common/utils/strings.h create mode 100644 speechx/speechx/common/utils/strings_test.cc diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index 6b957160..ed5c38f0 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -1,19 +1,28 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) -project(paddlespeech VERSION 0.1) - set(CMAKE_PROJECT_INCLUDE_BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/EnableCMP0048.cmake") +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") + +include(system) + +# Ninja Generator will set CMAKE_BUILD_TYPE to Debug +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE + STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +project(paddlespeech VERSION 0.1) + set(CMAKE_VERBOSE_MAKEFILE on) # set std-14 
set(CMAKE_CXX_STANDARD 14) -# cmake dir -set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) - -# Modules -list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}) include(FetchContent) include(ExternalProject) @@ -33,6 +42,7 @@ SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall ############################################################################### option(TEST_DEBUG "option for debug" OFF) option(USE_PROFILING "enable c++ profling" OFF) +option(WITH_TESTING "unit test" ON) option(USING_U2 "compile u2 model." ON) option(USING_DS2 "compile with ds2 model." ON) @@ -42,26 +52,10 @@ option(USING_GPU "u2 compute on GPU." OFF) ############################################################################### # Include third party ############################################################################### -# example for include third party -# FetchContent_MakeAvailable was not added until CMake 3.14 -# FetchContent_MakeAvailable() -# include_directories() - -# gflags include(gflags) -# glog include(glog) -# gtest -include(gtest) - -# ABSEIL-CPP -include(absl) - -# libsndfile -include(libsndfile) - # boost # include(boost) # not work set(boost_SOURCE_DIR ${fc_patch}/boost-src) @@ -87,6 +81,11 @@ add_dependencies(openfst gflags glog) # paddle lib include(paddleinference) +# gtest +if(WITH_TESTING) + include(gtest) # download, build, install gtest +endif() + # python/pybind11/threads find_package(Threads REQUIRED) # https://cmake.org/cmake/help/latest/module/FindPython3.html#module:FindPython3 @@ -165,15 +164,6 @@ message(STATUS PADDLE_LIB_DIRS= ${PADDLE_LIB_DIRS}) ############################################################################### # Add local library ############################################################################### -# system lib -#find_package() -# if dir have CmakeLists.txt -#add_subdirectory(speechx) -# if dir do not have CmakeLists.txt -#add_library(lib_name STATIC file.cc) -#target_link_libraries(lib_name item0 
item1) -#add_dependencies(lib_name depend-target) - set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx) add_subdirectory(speechx) diff --git a/speechx/README.md b/speechx/README.md index 5d4b5845..70136ea0 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -113,3 +113,11 @@ apt-get install gfortran-8 4. `Undefined reference to '_gfortran_concat_string'` using gcc 8.2, gfortran 8.2. + +5. `./boost/python/detail/wrap_python.hpp:57:11: fatal error: pyconfig.h: No such file or directory` + +``` +apt-get install python3-dev +``` + +for more info please see [here](https://github.com/okfn/piati/issues/65). diff --git a/speechx/cmake/gtest.cmake b/speechx/cmake/gtest.cmake index 1ea8ed0b..365f25cf 100644 --- a/speechx/cmake/gtest.cmake +++ b/speechx/cmake/gtest.cmake @@ -1,3 +1,4 @@ + include(FetchContent) FetchContent_Declare( gtest @@ -6,4 +7,9 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(gtest) -include_directories(${gtest_BINARY_DIR} ${gtest_SOURCE_DIR}/src) \ No newline at end of file +include_directories(${gtest_BINARY_DIR} ${gtest_SOURCE_DIR}/src) + + +if(WITH_TESTING) + enable_testing() +endif() \ No newline at end of file diff --git a/speechx/cmake/openfst.cmake b/speechx/cmake/openfst.cmake index 07c33a74..8861f4f4 100644 --- a/speechx/cmake/openfst.cmake +++ b/speechx/cmake/openfst.cmake @@ -25,3 +25,5 @@ ExternalProject_Add(openfst ) link_directories(${openfst_PREFIX_DIR}/lib) include_directories(${openfst_PREFIX_DIR}/include) +message(STATUS "OpenFST inc dir: ${openfst_PREFIX_DIR}/include") +message(STATUS "OpenFST lib dir: ${openfst_PREFIX_DIR}/lib") \ No newline at end of file diff --git a/speechx/cmake/system.cmake b/speechx/cmake/system.cmake new file mode 100644 index 00000000..580e07bb --- /dev/null +++ b/speechx/cmake/system.cmake @@ -0,0 +1,106 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Detects the OS and sets appropriate variables.
+# CMAKE_SYSTEM_NAME only gives us a coarse-grained name of the OS CMake is
+# building for, but the host processor name like centos is necessary
+# in some scenes to distinguish system for customization.
+#
+# for instance, protobuf libs path is /lib64
+# on CentOS, but /lib on other systems.
+
+if(UNIX AND NOT APPLE)
+  # except apple from nix*Os family
+  set(LINUX TRUE)
+endif()
+
+if(WIN32)
+  set(HOST_SYSTEM "win32")
+else()
+  if(APPLE)
+    set(HOST_SYSTEM "macosx")
+    execute_process(COMMAND sw_vers -productVersion
+                    OUTPUT_VARIABLE HOST_SYSTEM_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+    string(REGEX MATCH "[0-9]+\\.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
+    # DEFINED takes ENV{name} without '$'; "$ENV{...}" would test the variable's value instead.
+    if(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+      # Set cache variable - end user may change this during ccmake or cmake-gui configure.
+      set(CMAKE_OSX_DEPLOYMENT_TARGET
+          ${MACOS_VERSION}
+          CACHE
+            STRING
+            "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value."
+ ) + endif() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") + else() + + if(EXISTS "/etc/issue") + file(READ "/etc/issue" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + elseif(LINUX_ISSUE MATCHES "Debian") + set(HOST_SYSTEM "debian") + elseif(LINUX_ISSUE MATCHES "Ubuntu") + set(HOST_SYSTEM "ubuntu") + elseif(LINUX_ISSUE MATCHES "Red Hat") + set(HOST_SYSTEM "redhat") + elseif(LINUX_ISSUE MATCHES "Fedora") + set(HOST_SYSTEM "fedora") + endif() + + string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION + "${LINUX_ISSUE}") + endif() + + if(EXISTS "/etc/redhat-release") + file(READ "/etc/redhat-release" LINUX_ISSUE) + if(LINUX_ISSUE MATCHES "CentOS") + set(HOST_SYSTEM "centos") + endif() + endif() + + if(NOT HOST_SYSTEM) + set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) + endif() + + endif() +endif() + +# query number of logical cores +cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) + +mark_as_advanced(HOST_SYSTEM CPU_CORES) + +message( + STATUS + "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") +message(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") + +# external dependencies log output +set(EXTERNAL_PROJECT_LOG_ARGS + LOG_DOWNLOAD + 0 # Wrap download in script to log output + LOG_UPDATE + 1 # Wrap update in script to log output + LOG_CONFIGURE + 1 # Wrap configure in script to log output + LOG_BUILD + 0 # Wrap build in script to log output + LOG_TEST + 1 # Wrap test in script to log output + LOG_INSTALL + 0 # Wrap install in script to log output +) \ No newline at end of file diff --git a/speechx/speechx/asr/decoder/CMakeLists.txt b/speechx/speechx/asr/decoder/CMakeLists.txt index f0fd32ba..93014fb9 100644 --- a/speechx/speechx/asr/decoder/CMakeLists.txt +++ b/speechx/speechx/asr/decoder/CMakeLists.txt @@ -19,7 +19,7 @@ if (USING_U2) endif() add_library(decoder STATIC ${srcs}) -target_link_libraries(decoder PUBLIC kenlm utils fst 
frontend nnet kaldi-decoder absl::strings) +target_link_libraries(decoder PUBLIC kenlm utils fst frontend nnet kaldi-decoder) # test if (USING_DS2) diff --git a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc index 15dbd7e9..2cef4972 100644 --- a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc +++ b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder.cc @@ -17,7 +17,6 @@ #include "decoder/ctc_prefix_beam_search_decoder.h" -#include "absl/strings/str_join.h" #include "base/common.h" #include "decoder/ctc_beam_search_opt.h" #include "decoder/ctc_prefix_beam_search_score.h" diff --git a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc index c59b1f2e..31c8b19e 100644 --- a/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc +++ b/speechx/speechx/asr/decoder/ctc_prefix_beam_search_decoder_main.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "absl/strings/str_split.h" #include "base/common.h" #include "decoder/ctc_prefix_beam_search_decoder.h" #include "frontend/audio/data_cache.h" diff --git a/speechx/speechx/asr/nnet/CMakeLists.txt b/speechx/speechx/asr/nnet/CMakeLists.txt index 43566616..27081086 100644 --- a/speechx/speechx/asr/nnet/CMakeLists.txt +++ b/speechx/speechx/asr/nnet/CMakeLists.txt @@ -9,7 +9,7 @@ if(USING_U2) endif() add_library(nnet STATIC ${srcs}) -target_link_libraries(nnet absl::strings) +target_link_libraries(nnet utils) if(USING_U2) target_compile_options(nnet PUBLIC ${PADDLE_COMPILE_FLAGS}) diff --git a/speechx/speechx/asr/nnet/ds2_nnet.cc b/speechx/speechx/asr/nnet/ds2_nnet.cc index 22c7f61b..f77c0a60 100644 --- a/speechx/speechx/asr/nnet/ds2_nnet.cc +++ b/speechx/speechx/asr/nnet/ds2_nnet.cc @@ -14,7 +14,7 @@ #include "nnet/ds2_nnet.h" -#include "absl/strings/str_split.h" +#include "utils/strings.h" namespace ppspeech { @@ -26,16 +26,16 @@ using std::vector; void PaddleNnet::InitCacheEncouts(const ModelOptions& opts) { std::vector cache_names; - cache_names = absl::StrSplit(opts.cache_names, ","); + cache_names = StrSplit(opts.cache_names, ","); std::vector cache_shapes; - cache_shapes = absl::StrSplit(opts.cache_shape, ","); + cache_shapes = StrSplit(opts.cache_shape, ","); assert(cache_shapes.size() == cache_names.size()); cache_encouts_.clear(); cache_names_idx_.clear(); for (size_t i = 0; i < cache_shapes.size(); i++) { std::vector tmp_shape; - tmp_shape = absl::StrSplit(cache_shapes[i], "-"); + tmp_shape = StrSplit(cache_shapes[i], "-"); std::vector cur_shape; std::transform(tmp_shape.begin(), tmp_shape.end(), @@ -74,8 +74,8 @@ PaddleNnet::PaddleNnet(const ModelOptions& opts) : opts_(opts) { LOG(INFO) << "start to check the predictor input and output names"; LOG(INFO) << "input names: " << opts.input_names; LOG(INFO) << "output names: " << opts.output_names; - vector input_names_vec = absl::StrSplit(opts.input_names, ","); - vector output_names_vec = 
absl::StrSplit(opts.output_names, ","); + std::vector input_names_vec = StrSplit(opts.input_names, ","); + std::vector output_names_vec = StrSplit(opts.output_names, ","); paddle_infer::Predictor* predictor = GetPredictor(); diff --git a/speechx/speechx/asr/server/websocket/CMakeLists.txt b/speechx/speechx/asr/server/websocket/CMakeLists.txt index cafbbec7..9991e47b 100644 --- a/speechx/speechx/asr/server/websocket/CMakeLists.txt +++ b/speechx/speechx/asr/server/websocket/CMakeLists.txt @@ -10,4 +10,4 @@ target_link_libraries(websocket_server_main PUBLIC fst websocket ${DEPS}) add_executable(websocket_client_main ${CMAKE_CURRENT_SOURCE_DIR}/websocket_client_main.cc) target_include_directories(websocket_client_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) -target_link_libraries(websocket_client_main PUBLIC fst websocket ${DEPS}) +target_link_libraries(websocket_client_main PUBLIC fst websocket ${DEPS}) \ No newline at end of file diff --git a/speechx/speechx/common/utils/CMakeLists.txt b/speechx/speechx/common/utils/CMakeLists.txt index c1e875be..c47b25c0 100644 --- a/speechx/speechx/common/utils/CMakeLists.txt +++ b/speechx/speechx/common/utils/CMakeLists.txt @@ -2,4 +2,18 @@ add_library(utils file_utils.cc math.cc -) \ No newline at end of file + strings.cc +) + + +if(WITH_TESTING) + enable_testing() + link_libraries(gtest_main gmock) + + add_executable(strings_test strings_test.cc) + target_link_libraries(strings_test PUBLIC utils) + add_test( + NAME strings_test + COMMAND strings_test + ) +endif() \ No newline at end of file diff --git a/speechx/speechx/common/utils/math.cc b/speechx/speechx/common/utils/math.cc index 71656cb3..e5832cbd 100644 --- a/speechx/speechx/common/utils/math.cc +++ b/speechx/speechx/common/utils/math.cc @@ -15,13 +15,14 @@ // limitations under the License. 
#include "utils/math.h" +#include "base/basic_types.h" #include #include #include #include - -#include "base/common.h" +#include +#include namespace ppspeech { diff --git a/speechx/speechx/common/utils/strings.cc b/speechx/speechx/common/utils/strings.cc new file mode 100644 index 00000000..6aa8af47 --- /dev/null +++ b/speechx/speechx/common/utils/strings.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <sstream>
+
+#include "utils/strings.h"
+
+namespace ppspeech {
+// Splits `str` at every character found in `delim`; empty pieces are skipped
+// when `omit_empty_string` is true. A null `delim` means "no separators".
+std::vector<std::string> StrSplit(const std::string& str, const char *delim, bool omit_empty_string){
+    std::vector<std::string> outs;
+    const std::string seps(delim == nullptr ? "" : delim);
+    // size_type rather than int: keeps the npos test exact for any length.
+    std::string::size_type start = 0;
+    for (;;) {
+        const std::string::size_type found = str.find_first_of(seps, start);
+        const std::string::size_type stop = (found == std::string::npos) ? str.size() : found;
+        if (!omit_empty_string || stop > start) {
+            outs.push_back(str.substr(start, stop - start));
+        }
+        if (found == std::string::npos) break;
+        start = found + 1;
+    }
+    return outs;
+}
+
+// Joins `strs` with `delim` between neighbours; a null `delim` acts as "".
+std::string StrJoin(const std::vector<std::string>& strs, const char* delim) {
+    const std::string sep(delim == nullptr ? "" : delim);
+    std::ostringstream ss;
+    for (std::size_t i = 0; i < strs.size(); ++i) {
+        if (i != 0) ss << sep;
+        ss << strs[i];
+    }
+    return ss.str();
+}
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/common/utils/strings.h b/speechx/speechx/common/utils/strings.h
new file mode 100644
index 00000000..e2629164
--- /dev/null
+++ b/speechx/speechx/common/utils/strings.h
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace ppspeech {
+// Splits `str` at any character of `delim`; empty tokens are dropped unless omit_empty_string is false.
+std::vector<std::string> StrSplit(const std::string& str, const char *delim, bool omit_empty_string=true);
+// Concatenates `strs`, placing `delim` between consecutive elements.
+std::string StrJoin(const std::vector<std::string>& strs, const char* delim);
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/common/utils/strings_test.cc b/speechx/speechx/common/utils/strings_test.cc
new file mode 100644
index 00000000..a2950d32
--- /dev/null
+++ b/speechx/speechx/common/utils/strings_test.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "utils/strings.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+
+TEST(StringTest, StrSplitTest) {
+    using ::testing::ElementsAre;
+
+    std::string test_str = "hello world";
+    std::vector<std::string> outs = ppspeech::StrSplit(test_str, " \t");
+    EXPECT_THAT(outs, ElementsAre("hello", "world"));
+
+    // Leading, trailing and repeated delimiters yield no empty tokens by default.
+    EXPECT_THAT(ppspeech::StrSplit(",a,,b,", ","), ElementsAre("a", "b"));
+    // With omit_empty_string=false every empty field is preserved.
+    EXPECT_THAT(ppspeech::StrSplit("a,,b", ",", false), ElementsAre("a", "", "b"));
+}
+
+
+TEST(StringTest, StrJoinTest) {
+    std::vector<std::string> ins{"hello", "world"};
+    std::string out = ppspeech::StrJoin(ins, " ");
+    EXPECT_EQ(out, "hello world");
+    // An empty input joins to an empty string.
+    EXPECT_EQ(ppspeech::StrJoin({}, ","), "");
+}
\ No newline at end of file