From 9d32f62f48826331f53ad2eee54d9a38ca00d105 Mon Sep 17 00:00:00 2001 From: qingen Date: Thu, 20 Jan 2022 16:59:58 +0800 Subject: [PATCH] [vector] add AMI data preparation scripts --- dataset/ami/README.md | 5 -- examples/ami/README.md | 13 +++++ .../ami/diarization}/.gitignore | 0 .../ami/diarization/local}/ami_prepare.py | 1 - .../ami/diarization/local}/ami_splits.py | 0 examples/ami/diarization/local/data.sh | 47 +++++++++++++++++++ .../ami/diarization/local}/dataio.py | 0 examples/ami/diarization/path.sh | 15 ++++++ examples/ami/diarization/run.sh | 14 ++++++ examples/ami/diarization/utils | 1 + {paddlespeech/vector/utils => utils}/DER.py | 0 .../vector/utils => utils}/md-eval.pl | 0 12 files changed, 90 insertions(+), 6 deletions(-) delete mode 100644 dataset/ami/README.md create mode 100644 examples/ami/README.md rename {dataset/ami => examples/ami/diarization}/.gitignore (100%) rename {dataset/ami => examples/ami/diarization/local}/ami_prepare.py (99%) rename {dataset/ami => examples/ami/diarization/local}/ami_splits.py (100%) create mode 100755 examples/ami/diarization/local/data.sh rename {dataset/ami => examples/ami/diarization/local}/dataio.py (100%) create mode 100644 examples/ami/diarization/path.sh create mode 100644 examples/ami/diarization/run.sh create mode 120000 examples/ami/diarization/utils rename {paddlespeech/vector/utils => utils}/DER.py (100%) rename {paddlespeech/vector/utils => utils}/md-eval.pl (100%) diff --git a/dataset/ami/README.md b/dataset/ami/README.md deleted file mode 100644 index ac65eedf..00000000 --- a/dataset/ami/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# [AMI](https://groups.inf.ed.ac.uk/ami/corpus/) - -The AMI Meeting Corpus is a multi-modal data set consisting of 100 hours of meeting recordings. For a gentle introduction to the corpus, see the [corpus overview](https://groups.inf.ed.ac.uk/ami/corpus/overview.shtml). To access the data, follow the directions given [there](https://groups.inf.ed.ac.uk/ami/download). 
Around two-thirds of the data has been elicited using a scenario in which the participants play different roles in a design team, taking a design project from kick-off to completion over the course of a day. The rest consists of naturally occurring meetings in a range of domains. - -Detailed information can be found in the [documentation section](http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml). diff --git a/examples/ami/README.md b/examples/ami/README.md new file mode 100644 index 00000000..8d2ed518 --- /dev/null +++ b/examples/ami/README.md @@ -0,0 +1,13 @@ +# Speaker Diarization on the AMI corpus + +## About the AMI corpus: +"The AMI Meeting Corpus consists of 100 hours of meeting recordings. The recordings use a range of signals synchronized to a common timeline. These include close-talking and far-field microphones, individual and room-view video cameras, and output from a slide projector and an electronic whiteboard. During the meetings, the participants also have unsynchronized pens available to them that record what is written. The meetings were recorded in English using three different rooms with different acoustic properties, and include mostly non-native speakers." See the [AMI overview](http://groups.inf.ed.ac.uk/ami/corpus/overview.shtml) for more details. + +## About the example +The script performs diarization using x-vectors (TDNN, ECAPA-TDNN) on the AMI mix-headset data. We demonstrate the use of two clustering methods: agglomerative hierarchical clustering (AHC) and spectral clustering. + +## How to Run +Use the following command to run diarization on the AMI corpus. +`bash ./run.sh` + +## Results (DER) coming soon! 
:) \ No newline at end of file diff --git a/dataset/ami/.gitignore b/examples/ami/diarization/.gitignore similarity index 100% rename from dataset/ami/.gitignore rename to examples/ami/diarization/.gitignore diff --git a/dataset/ami/ami_prepare.py b/examples/ami/diarization/local/ami_prepare.py similarity index 99% rename from dataset/ami/ami_prepare.py rename to examples/ami/diarization/local/ami_prepare.py index c2f88478..b7bb8e67 100644 --- a/dataset/ami/ami_prepare.py +++ b/examples/ami/diarization/local/ami_prepare.py @@ -567,7 +567,6 @@ if __name__ == '__main__': help='Overlap duration in seconds between adjacent subsegments') args = parser.parse_args() - print(args) prepare_ami(args.data_folder, args.manual_annot_folder, args.save_folder, args.ref_rttm_dir, args.meta_data_dir) diff --git a/dataset/ami/ami_splits.py b/examples/ami/diarization/local/ami_splits.py similarity index 100% rename from dataset/ami/ami_splits.py rename to examples/ami/diarization/local/ami_splits.py diff --git a/examples/ami/diarization/local/data.sh b/examples/ami/diarization/local/data.sh new file mode 100755 index 00000000..da2c546d --- /dev/null +++ b/examples/ami/diarization/local/data.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +stage=1 + +data_folder=/home/data/ami/amicorpus #e.g., /path/to/amicorpus/ +manual_annot_folder=/home/data/ami/ami_public_manual_1.6.2 #e.g., /path/to/ami_public_manual_1.6.2/ + +save_folder=results +ref_rttm_dir=results/ref_rttms +meta_data_dir=results/metadata + +set=L + +. ${MAIN_ROOT}/utils/parse_options.sh || exit 1; +set -u +set -o pipefail + +mkdir -p ${save_folder} + +if [ ${stage} -le 0 ]; then + # Download AMI corpus, You need around 10GB of free space to get whole data + # The signals are too large to package in this way, + # so you need to use the chooser to indicate which ones you wish to download + echo "Please follow https://groups.inf.ed.ac.uk/ami/download/ to download the data." 
+ echo "Annotations: AMI manual annotations v1.6.2 " + echo "Signals: Scenario Meetings/Non Scenario Meetings, some sessions recommended but not all" + echo "media streams: Headset mix, recommended first" + exit 0; +fi + +if [ ${stage} -le 1 ]; then + echo "AMI Data preparation" + + python local/ami_prepare.py --data_folder ${data_folder} \ + --manual_annot_folder ${manual_annot_folder} \ + --save_folder ${save_folder} --ref_rttm_dir ${ref_rttm_dir} \ + --meta_data_dir ${meta_data_dir} + + if [ $? -ne 0 ]; then + echo "Prepare AMI failed. Please check log message." + exit 1 + fi + +fi + +echo "AMI data preparation done." +exit 0 diff --git a/dataset/ami/dataio.py b/examples/ami/diarization/local/dataio.py similarity index 100% rename from dataset/ami/dataio.py rename to examples/ami/diarization/local/dataio.py diff --git a/examples/ami/diarization/path.sh b/examples/ami/diarization/path.sh new file mode 100644 index 00000000..60146113 --- /dev/null +++ b/examples/ami/diarization/path.sh @@ -0,0 +1,15 @@ +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +# model exp +#MODEL=ECAPA_TDNN +#export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}/bin diff --git a/examples/ami/diarization/run.sh b/examples/ami/diarization/run.sh new file mode 100644 index 00000000..91d4b706 --- /dev/null +++ b/examples/ami/diarization/run.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. path.sh || exit 1; +set -e + +stage=1 + + +. 
${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +if [ ${stage} -le 1 ]; then + # prepare data + bash ./local/data.sh || exit -1 +fi \ No newline at end of file diff --git a/examples/ami/diarization/utils b/examples/ami/diarization/utils new file mode 120000 index 00000000..973afe67 --- /dev/null +++ b/examples/ami/diarization/utils @@ -0,0 +1 @@ +../../../utils \ No newline at end of file diff --git a/paddlespeech/vector/utils/DER.py b/utils/DER.py similarity index 100% rename from paddlespeech/vector/utils/DER.py rename to utils/DER.py diff --git a/paddlespeech/vector/utils/md-eval.pl b/utils/md-eval.pl similarity index 100% rename from paddlespeech/vector/utils/md-eval.pl rename to utils/md-eval.pl