From 98aaa3810be6eb0b83613e86c32fc6f8f55a9b9c Mon Sep 17 00:00:00 2001
From: qingen
Date: Wed, 12 Jan 2022 20:07:56 +0800
Subject: [PATCH 1/9] add DER scripts to calculate Diarization Error Rate

---
 utils/DER.py | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 152 insertions(+)
 create mode 100755 utils/DER.py

diff --git a/utils/DER.py b/utils/DER.py
new file mode 100755
index 000000000..4c67c4c7b
--- /dev/null
+++ b/utils/DER.py
@@ -0,0 +1,152 @@
+"""Calculates Diarization Error Rate (DER) which is the sum of Missed Speaker (MS),
+False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT Evaluation.
+
+Authors
+ * Neville Ryant 2018
+ * Nauman Dawalatabad 2020
+
+Credits
+ This code is adapted from https://github.com/nryant/dscore
+"""
+
+import os
+import re
+import subprocess
+import numpy as np
+
+FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
+SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
+MISS_SPEAKER_TIME = re.compile(r"(?<=MISSED SPEAKER TIME =)[\d.]+")
+FA_SPEAKER_TIME = re.compile(r"(?<=FALARM SPEAKER TIME =)[\d.]+")
+ERROR_SPEAKER_TIME = re.compile(r"(?<=SPEAKER ERROR TIME =)[\d.]+")
+
+
+def rectify(arr):
+    """Corrects corner cases and converts scores into percentage.
+    """
+
+    # Numerator and denominator both 0.
+    arr[np.isnan(arr)] = 0
+
+    # Numerator > 0, but denominator = 0.
+    arr[np.isinf(arr)] = 1
+    arr *= 100.0
+
+    return arr
+
+
+def DER(
+    ref_rttm,
+    sys_rttm,
+    ignore_overlap=False,
+    collar=0.25,
+    individual_file_scores=False,
+):
+    """Computes Missed Speaker percentage (MS), False Alarm (FA),
+    Speaker Error Rate (SER), and Diarization Error Rate (DER).
+
+    Arguments
+    ---------
+    ref_rttm : str
+        The path of reference/groundtruth RTTM file.
+    sys_rttm : str
+        The path of the system generated RTTM file.
+    individual_file_scores : bool
+        If True, returns scores for each file in order.
+    collar : float
+        Forgiveness collar.
+    ignore_overlap : bool
+        If True, ignores overlapping speech during evaluation.
+
+    Returns
+    -------
+    MS : float array
+        Missed Speech.
+    FA : float array
+        False Alarms.
+    SER : float array
+        Speaker Error Rates.
+    DER : float array
+        Diarization Error Rates.
+
+    Example
+    -------
+    >>> import pytest
+    >>> pytest.skip('Skipping because of Perl dependency')
+    >>> ref_rttm = "../../samples/rttm_samples/ref_rttm/ES2014c.rttm"
+    >>> sys_rttm = "../../samples/rttm_samples/sys_rttm/ES2014c.rttm"
+    >>> ignore_overlap = True
+    >>> collar = 0.25
+    >>> individual_file_scores = True
+    >>> Scores = DER(ref_rttm, sys_rttm, ignore_overlap, collar, individual_file_scores)
+    >>> print (Scores)
+    (array([0., 0.]), array([0., 0.]), array([7.16923618, 7.16923618]), array([7.16923618, 7.16923618]))
+    """
+
+    curr = os.path.abspath(os.path.dirname(__file__))
+    mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl")
+
+    cmd = [
+        mdEval,
+        "-af",
+        "-r",
+        ref_rttm,
+        "-s",
+        sys_rttm,
+        "-c",
+        str(collar),
+    ]
+    if ignore_overlap:
+        cmd.append("-1")
+
+    try:
+        stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
+
+    except subprocess.CalledProcessError as ex:
+        stdout = ex.output
+
+    # Decode on both paths: the output is bytes, but the regexes below are str.
+    stdout = stdout.decode("utf-8")
+
+    # Get all recording IDs
+    file_ids = [m.strip() for m in FILE_IDS.findall(stdout)]
+    file_ids = [
+        file_id[2:] if file_id.startswith("f=") else file_id
+        for file_id in file_ids
+    ]
+
+    scored_speaker_times = np.array(
+        [float(m) for m in SCORED_SPEAKER_TIME.findall(stdout)]
+    )
+
+    miss_speaker_times = np.array(
+        [float(m) for m in MISS_SPEAKER_TIME.findall(stdout)]
+    )
+
+    fa_speaker_times = np.array(
+        [float(m) for m in FA_SPEAKER_TIME.findall(stdout)]
+    )
+
+    error_speaker_times = np.array(
+        [float(m) for m in ERROR_SPEAKER_TIME.findall(stdout)]
+    )
+
+    with np.errstate(invalid="ignore", divide="ignore"):
+        tot_error_times = (
+            miss_speaker_times + fa_speaker_times + error_speaker_times
+        )
+        miss_speaker_frac = miss_speaker_times / scored_speaker_times
+        fa_speaker_frac = fa_speaker_times / scored_speaker_times
+        sers_frac = error_speaker_times / scored_speaker_times
+        ders_frac = tot_error_times / scored_speaker_times
+
+    # Values in percentage of scored_speaker_time
+    miss_speaker = rectify(miss_speaker_frac)
+    fa_speaker = rectify(fa_speaker_frac)
+    sers = rectify(sers_frac)
+    ders = rectify(ders_frac)
+
+    if individual_file_scores:
+        return miss_speaker, fa_speaker, sers, ders
+    else:
+        return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1]

From 03a5750276950176d72f0d1a997fb17c19511d48 Mon Sep 17 00:00:00 2001
From: qingen
Date: Fri, 14 Jan 2022 16:32:05 +0800
Subject: [PATCH 2/9] [vector] add DER scripts to calculate Diarization Error Rate

---
 utils/DER.py     |   39 +-
 utils/md-eval.pl | 2938 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2975 insertions(+), 2 deletions(-)
 create mode 100755 utils/md-eval.pl

diff --git a/utils/DER.py b/utils/DER.py
index 4c67c4c7b..25003b0a4 100755
--- a/utils/DER.py
+++ b/utils/DER.py
@@ -4,11 +4,13 @@ False Alarm (FA), and Speaker Error Rate (SER) using md-eval-22.pl from NIST RT
 Authors
  * Neville Ryant 2018
  * Nauman Dawalatabad 2020
+ * Qingen Zhao 2021
 
 Credits
  This code is adapted from https://github.com/nryant/dscore
 """
-
+import argparse
+from distutils.util import strtobool
 import os
 import re
 import subprocess
@@ -84,7 +86,7 @@ def DER(
     """
 
     curr = os.path.abspath(os.path.dirname(__file__))
-    mdEval = os.path.join(curr, "../../tools/der_eval/md-eval.pl")
+    mdEval = os.path.join(curr, "./md-eval.pl")
 
     cmd = [
         mdEval,
@@ -150,3 +152,36 @@ def DER(
         return miss_speaker, fa_speaker, sers, ders
     else:
         return miss_speaker[-1], fa_speaker[-1], sers[-1], ders[-1]
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description='Compute Diarization Error Rate')
+    parser.add_argument(
+        '--ref_rttm', required=True, help='the path of reference/groundtruth RTTM file')
+    parser.add_argument(
+        '--sys_rttm', required=True, help='the path of the system generated RTTM file')
+    parser.add_argument(
+        '--individual_file',
+        default=False,
+        type=strtobool,
+        help='if True, returns scores for each file in order')
+    parser.add_argument(
+        '--collar', default=0.25, type=float, help='forgiveness collar')
+    parser.add_argument(
+        '--ignore_overlap',
+        default=False,
+        type=strtobool,
+        help='if True, ignores overlapping speech during evaluation')
+    args = parser.parse_args()
+    print(args)
+
+    scores = DER(
+        args.ref_rttm,
+        args.sys_rttm,
+        ignore_overlap=bool(args.ignore_overlap),
+        collar=args.collar,
+        individual_file_scores=bool(args.individual_file),
+    )
+    # DER returns four scalars or four arrays; atleast_1d lets both print row by row.
+    for row in zip(*np.atleast_1d(*scores)):
+        print("miss_speaker: %.3f%% fa_speaker: %.3f%% sers: %.3f%% ders: %.3f%%" % row)
\ No newline at end of file
diff --git a/utils/md-eval.pl b/utils/md-eval.pl
new file mode 100755
index 000000000..0356b927f
--- /dev/null
+++ b/utils/md-eval.pl
@@ -0,0 +1,2938 @@
+#!/usr/bin/perl -w
+#################################
+# NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan.
+# https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf
+# Source (dscore): https://github.com/nryant/dscore/blob/master/scorelib/md-eval-22.pl
+#################################
+# BSD 2-Clause License
+#
+# Copyright (c) 2018, Neville Ryant
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +################################# + +use strict; + +my $version = "22"; + +################################# +# History: +# +# version 22: * JGF: added an option '-m FILE' to hold a CSV speaker map file. +# +# version 21: * JGF: added a flag '-n' to not remove the directory paths from the source +# files in the UEM file. +# +# version 20: * change metadata discard rule: rather than discard if the midpoint +# (or endpoint) of the metadata object lies in a no-eval zone, discard +# if there is ANY overlap whatsoever between the metadata object and +# a no-eval zone. This holds for system output objects only if the +# system output metadata object is not mapped to a ref object. +# * optimize IP and SU mapping by giving a secondary bonus mapping score +# to candidate ref-sys MD map pairs if the end-words of both coincide. 
+# +# version 19: * bug fix in subroutine speakers_match +# * bug fix in tag_ref_words_with_metadata_info +# +# version 18: * cosmetic fix to error message in eval_condition +# * added conditional output options for word coverage performance +# * added secondary MD word coverage optimization to word alignment +# * further optimize word alignment by considering MD subtypes +# * further optimize MD alignment by considering MD subtypes +# * add a new SU discard rule: discard if TEND in no-eval zone +# * enforce legal values for su_extent_limit +# +# version 17: create_speaker_segs modified to accommodate the same speaker +# having multiple overlapping speaker segments. (This is an +# error and pathological condition, but the system must either +# disallow (abort on) the condition, or perform properly under +# the pathological condition. The second option is chosen.) +# +# version 16: * If neither -w nor -W is specified, suppress warnings about +# ref SPEAKER records subsuming no lexemes. +# * Output the overall speaker diarization stats after the +# stats for the individual files +# * Do not alter the case of alphabetic characters in the filename +# field from the ref rttm file +# * Made the format of the overall speaker error line more similar to +# the corresponding line of output from SpkrSegEval, to facilitate +# use of existing "grep" commands in existing scripts. 
+# +# version 15: * bug fix in create_speaker_segs to accommodate +# contiguous same-speaker segments +# * added conditional file/channel scoring to +# speaker diarization evaluation +# +# version 14: bug fix in md_score +# +# version 13: add DISCOURSE_RESPONSE as a FILLER subtype +# +# version 12: make REF LEXEMES optional if they aren't required +# +# version 11: change default for noscore MD regions +# +# version 10: bug fix +# +# version 09: +# * avoid crash when metadata discard yields no metadata +# * make evaluated ref_wds sensitive to metadata type +# * defer discarding of system output metadata until after +# metadata mapping, then discard only unmapped events. +# * extend 1-speaker scoring inhibition to metadata +# * eliminate demand for SPKR-INFO subtype for speakers +# * correct ref count of IP and SU exact boundary words +# * add official RT-04F scores +# * add conditional analyses for file/chnl/spkr/gender +# +# version 08: +# * bug fixes speaker diarization scoring +# - count of EVAL_WORDS corrected +# - no-score extended to nearest SPEAKER boundary +# +# version 07: +# * warning issued when discarding metadata events +# that cover LEXEMEs in the evaluation region +# +# version 06: +# * eliminated unused speakers from speaker scoring +# * changed discard algorithm for unannotated SU's and +# complex EDIT's to discard sys SU's and EDIT's when +# their midpoints overlap (rather than ANY overlap). 
+# * fixed display_metadata_mapping +# +# version 05: +# * upgraded display_metadata_mapping +# +# version 04: +# * diagnostic metadata mapping output added +# * uem_from_rttm bug fix +# +# version 03: +# * adjusted times used for speaker diarization +# * changed usage of max_extend to agree with cookbook +# +# version 02: speaker diarization evaluation added +# +# version 01: a merged version of df-eval-v14 and su-eval-v16 +# +################################# + +#global data +my $epsilon = 1E-8; +my $miss_name = " MISS"; +my $fa_name = " FALSE ALARM"; +my %rttm_datatypes = (SEGMENT => {eval => 1, "" => 1}, + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + LEXEME => {lex => 1, fp => 1, frag => 1, "un-lex" => 1, + "for-lex" => 1, alpha => 1, acronym => 1, + interjection => 1, propernoun => 1, other => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + "NON-SPEECH" => {noise => 1, music => 1, other => 1}, + FILLER => {filled_pause => 1, discourse_marker => 1, + discourse_response => 1, explicit_editing_term => 1, + other => 1}, + EDIT => {repetition => 1, restart => 1, revision => 1, + simple => 1, complex => 1, other => 1}, + IP => {edit => 1, filler => 1, "edit&filler" => 1, + other => 1}, + SU => {statement => 1, backchannel => 1, question => 1, + incomplete => 1, unannotated => 1, other => 1}, + CB => {coordinating => 1, clausal => 1, other => 1}, + "A/P" => {"" => 1}, + SPEAKER => {"" => 1}, + "SPKR-INFO" => {adult_male => 1, adult_female => 1, child => 1, unknown => 1}); +my %md_subtypes = (FILLER => $rttm_datatypes{FILLER}, + EDIT => $rttm_datatypes{EDIT}, + IP => $rttm_datatypes{IP}, + SU => $rttm_datatypes{SU}); +my %spkr_subtypes = (adult_male => 1, adult_female => 1, child => 1, unknown => 1); + +my $noeval_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + NO_RT_METADATA => {"" => 1}, + }, +}; +my $noscore_mds = { + DEFAULT => { + NOSCORE => {"" => 1}, + LEXEME => {"un-lex" => 1}, + SU => 
{unannotated => 1}, + }, + MIN => { + NOSCORE => {"" => 1}, + SU => {unannotated => 1}, + }, + FRAG_UNLEX => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1, "un-lex" => 1}, + SU => {unannotated => 1}, + }, + FRAG => { + NOSCORE => {"" => 1}, + LEXEME => {frag => 1}, + SU => {unannotated => 1}, + }, + NONE => { + }, +}; +my $noeval_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + }, +}; +my $noscore_sds = { + DEFAULT => { + NOSCORE => {"" => 1}, + "NON-LEX" => {laugh => 1, breath => 1, lipsmack => 1, + cough => 1, sneeze => 1, other => 1}, + }, +}; + +my %speaker_map; + +my $default_extend = 0.50; #the maximum time (in seconds) to extend a no-score zone +my $default_collar = 0.00; #the no-score collar (in +/- seconds) to attach to SPEAKER boundaries +my $default_tgap = 1.00; #the max gap (in seconds) between matching ref/sys words +my $default_Tgap = 1.00; #the max gap (in seconds) between matching ref/sys metadata events +my $default_Wgap = 0.10; #the max gap (in words) between matching ref/sys metadata events +my $default_su_time_limit = 0.50; #the max extent (in seconds) to match for SU's +my $default_su_word_limit = 2.00; #the max extent (in words) to match for SU's +my $default_word_delta_score = 10.0; #the max delta score for word-based DP alignment of ref/sys words +my $default_time_delta_score = 1.00; #the max delta score for time-based DP alignment of ref/sys words + +my $usage = "\n\nUsage: $0 [-h] -r -s \n\n". + "Description: md-eval evaluates EARS metadata detection performance\n". + " by comparing system metadata output data with reference data\n". + "INPUT:\n". + " -R A file containing a list of the reference metadata files\n". + " being evaluated, in RTTM format. If the word-mediated alignment\n". + " option is used then this data must include reference STT data\n". + " in addition to the metadata being evaluated.\n". + " OR\n". + " -r A file containing reference metadata, in RTTM format\n\n". 
+ " -S A file containing a list of the system output metadata\n". + " files to be evaluated, in RTTM format. If the word-mediated\n". + " alignment option is used then this data must include system STT\n". + " output data in addition to the metadata to be evaluated.\n". + " OR\n". + " -s A file containing system output metadata, in RTTM format\n\n". + " input options:\n". + " -x to include complex edits in the analysis and scoring.\n". + " -w for word-mediated alignment.\n". + " * The default (time-mediated) alignment aligns ref and sys metadata\n". + " according to the time overlap of the original ref and sys metadata\n". + " time intervals.\n". + " * Word-mediated alignment aligns ref and sys metadata according to\n". + " the alignment of the words that are subsumed within the metadata\n". + " time intervals.\n". + " -W for word-optimized mapping.\n". + " * The default (time-optimized) mapping maps ref and sys metadata\n". + " so as to maximize the time overlap of mapped metadata events.\n". + " * Word-optimized mapping maps ref and sys metadata so as to\n". + " maximize the overlap in terms of the number of reference words\n". + " that are subsumed within the overlapping time interval.\n". + " -a Conditional analysis options for metadata detection performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " g for performance versus gender, and\n". + " s for performance versus speaker.\n". + " -A Conditional analysis options for word coverage performance:\n". + " c for performance versus channel,\n". + " f for performance versus file,\n". + " -t