#!/usr/bin/perl -w ################################# # NIST. (2009). The 2009 (RT-09) Rich Transcription Meeting Recognition Evaluation Plan. # https://web.archive.org/web/20100606041157if_/http://www.itl.nist.gov/iad/mig/tests/rt/2009/docs/rt09-meeting-eval-plan-v2.pdf # Source (dscore): https://github.com/nryant/dscore/blob/master/scorelib/md-eval-22.pl ################################# # BSD 2-Clause License # # Copyright (c) 2018, Neville Ryant # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # * Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ################################# use strict; my $version = "22"; ################################# # History: # # version 22: * JGF: added an option '-m FILE' to hold a CSV speaker map file. 
# # version 21: * JGF: added a flag '-n' to not remove the directory paths from the source # files in the UEM file. # # version 20: * change metadata discard rule: rather than discard if the midpoint # (or endpoint) of the metadata object lies in a no-eval zone, discard # if there is ANY overlap whatsoever between the metadata object and # a no-eval zone. This holds for system output objects only if the # system output metadata object is not mapped to a ref object. # * optimize IP and SU mapping by giving a secondary bonus mapping score # to candidate ref-sys MD map pairs if the end-words of both coincide. # # version 19: * bug fix in subroutine speakers_match # * bug fix in tag_ref_words_with_metadata_info # # version 18: * cosmetic fix to error message in eval_condition # * added conditional output options for word coverage performance # * added secondary MD word coverage optimization to word alignment # * further optimize word alignment by considering MD subtypes # * further optimize MD alignment by considering MD subtypes # * add a new SU discard rule: discard if TEND in no-eval zone # * enforce legal values for su_extent_limit # # version 17: create_speaker_segs modified to accommodate the same speaker # having multiple overlapping speaker segments. (This is an # error and pathological condition, but the system must either # disallow (abort on) the condition, or perform properly under # the pathological condition. The second option is chosen.) # # version 16: * If neither -w nor -W is specified, suppress warnings about # ref SPEAKER records subsuming no lexemes. # * Output the overall speaker diarization stats after the # stats for the individual files # * Do not alter the case of alphabetic characters in the filename # field from the ref rttm file # * Made the format of the overall speaker error line more similar to # the corresponding line of output from SpkrSegEval, to facilitate # use of existing "grep" commands in existing scripts. 
# # version 15: * bug fix in create_speaker_segs to accommodate # contiguous same-speaker segments # * added conditional file/channel scoring to # speaker diarization evaluation # # version 14: bug fix in md_score # # version 13: add DISCOURSE_RESPONSE as a FILLER subtype # # version 12: make REF LEXEMES optional if they aren't required # # version 11: change default for noscore MD regions # # version 10: bug fix # # version 09: # * avoid crash when metadata discard yields no metadata # * make evaluated ref_wds sensitive to metadata type # * defer discarding of system output metadata until after # metadata mapping, then discard only unmapped events. # * extend 1-speaker scoring inhibition to metadata # * eliminate demand for SPKR-INFO subtype for speakers # * correct ref count of IP and SU exact boundary words # * add official RT-04F scores # * add conditional analyses for file/chnl/spkr/gender # # version 08: # * bug fixes speaker diarization scoring # - count of EVAL_WORDS corrected # - no-score extended to nearest SPEAKER boundary # # version 07: # * warning issued when discarding metadata events # that cover LEXEMEs in the evaluation region # # version 06: # * eliminated unused speakers from speaker scoring # * changed discard algorithm for unannotated SU's and # complex EDIT's to discard sys SU's and EDIT's when # their midpoints overlap (rather than ANY overlap). 
#     * fixed display_metadata_mapping
#
# version 05:
#     * upgraded display_metadata_mapping
#
# version 04:
#     * diagnostic metadata mapping output added
#     * uem_from_rttm bug fix
#
# version 03:
#     * adjusted times used for speaker diarization
#     * changed usage of max_extend to agree with cookbook
#
# version 02: speaker diarization evaluation added
#
# version 01: a merged version of df-eval-v14 and su-eval-v16
#
#################################

#global data

# Small numeric constant plus the two fixed labels; the code that consumes
# them lies outside this section.
my $epsilon   = 1e-8;
my $miss_name = " MISS";
my $fa_name   = " FALSE ALARM";

# RTTM record type => set of subtype names listed for that type.
# Every set is a hash whose values are all 1; the empty-string key is the
# only entry for types that list no named subtype.
my %rttm_datatypes = (
    SEGMENT        => { map { $_ => 1 } ("eval", "") },
    NOSCORE        => { "" => 1 },
    NO_RT_METADATA => { "" => 1 },
    LEXEME         => { map { $_ => 1 } qw(lex fp frag un-lex for-lex alpha
                                           acronym interjection propernoun other) },
    "NON-LEX"      => { map { $_ => 1 } qw(laugh breath lipsmack cough sneeze other) },
    "NON-SPEECH"   => { map { $_ => 1 } qw(noise music other) },
    FILLER         => { map { $_ => 1 } qw(filled_pause discourse_marker
                                           discourse_response
                                           explicit_editing_term other) },
    EDIT           => { map { $_ => 1 } qw(repetition restart revision
                                           simple complex other) },
    IP             => { map { $_ => 1 } qw(edit filler edit&filler other) },
    SU             => { map { $_ => 1 } qw(statement backchannel question
                                           incomplete unannotated other) },
    CB             => { map { $_ => 1 } qw(coordinating clausal other) },
    "A/P"          => { "" => 1 },
    SPEAKER        => { "" => 1 },
    "SPKR-INFO"    => { map { $_ => 1 } qw(adult_male adult_female child unknown) },
);

# The four metadata types, each pointing at the SAME subtype hash that
# %rttm_datatypes holds (shared references, not copies).
my %md_subtypes = map { $_ => $rttm_datatypes{$_} } qw(FILLER EDIT IP SU);

# Stand-alone set with the same four names as the "SPKR-INFO" entry above.
my %spkr_subtypes = map { $_ => 1 } qw(adult_male adult_female child unknown);

# Named rule sets: rule name => { record type => { subtype => 1 } }.
# NOTE(review): judging by the names and the version history, "noeval" and
# "noscore" select the records that define no-eval / no-score zones, "_mds"
# for metadata scoring and "_sds" for speaker diarization -- confirm against
# the code that reads these tables.
my $noeval_mds = {
    DEFAULT => {
        NOSCORE        => { "" => 1 },
        NO_RT_METADATA => { "" => 1 },
    },
};

my $noscore_mds = {
    DEFAULT => {
        NOSCORE => { "" => 1 },
        LEXEME  => { "un-lex" => 1 },
        SU      => { unannotated => 1 },
    },
    MIN => {
        NOSCORE => { "" => 1 },
        SU      => { unannotated => 1 },
    },
    FRAG_UNLEX => {
        NOSCORE => { "" => 1 },
        LEXEME  => { map { $_ => 1 } qw(frag un-lex) },
        SU      => { unannotated => 1 },
    },
    FRAG => {
        NOSCORE => { "" => 1 },
        LEXEME  => { frag => 1 },
        SU      => { unannotated => 1 },
    },
    NONE => {},
};

my $noeval_sds = {
    DEFAULT => {
        NOSCORE => { "" => 1 },
    },
};

# The "NON-LEX" set here is a separate hash, not a reference to the one in
# %rttm_datatypes.
my $noscore_sds = {
    DEFAULT => {
        NOSCORE   => { "" => 1 },
        "NON-LEX" => { map { $_ => 1 } qw(laugh breath lipsmack cough sneeze other) },
    },
};

# Declared empty here; nothing in this section fills it.
my %speaker_map;

# Default tuning values.
# the maximum time (in seconds) to extend a no-score zone
my $default_extend = 0.50;
# the no-score collar (in +/- seconds) to attach to SPEAKER boundaries
my $default_collar = 0.00;
# the max gap (in seconds) between matching ref/sys words
my $default_tgap = 1.00;
# the max gap (in seconds) between matching ref/sys metadata events
my $default_Tgap = 1.00;
# the max gap (in words) between matching ref/sys metadata events
my $default_Wgap = 0.10;
# the max extent (in seconds) to match for SU's
my $default_su_time_limit = 0.50;
# the max extent (in words) to match for SU's
my $default_su_word_limit = 2.00;
# the max delta score for word-based DP alignment of ref/sys words
my $default_word_delta_score = 10.0;
# the max delta score for time-based DP alignment of ref/sys words
my $default_time_delta_score = 1.00;

# Help text. Only the opening fragments belong to this span; the
# concatenation is continued by the lines that follow it, so the trailing
# "." on the last line below is intentional.
my $usage = "\n\nUsage: $0 [-h] -r -s \n\n".
    "Description: md-eval evaluates EARS metadata detection performance\n".
    " by comparing system metadata output data with reference data\n".
    "INPUT:\n".
    " -R A file containing a list of the reference metadata files\n".
    " being evaluated, in RTTM format. If the word-mediated alignment\n".
    " option is used then this data must include reference STT data\n".
    " in addition to the metadata being evaluated.\n".
    " OR\n".
    " -r A file containing reference metadata, in RTTM format\n\n".
    " -S A file containing a list of the system output metadata\n".
    " files to be evaluated, in RTTM format. If the word-mediated\n".
    " alignment option is used then this data must include system STT\n".
" output data in addition to the metadata to be evaluated.\n". " OR\n". " -s A file containing system output metadata, in RTTM format\n\n". " input options:\n". " -x to include complex edits in the analysis and scoring.\n". " -w for word-mediated alignment.\n". " * The default (time-mediated) alignment aligns ref and sys metadata\n". " according to the time overlap of the original ref and sys metadata\n". " time intervals.\n". " * Word-mediated alignment aligns ref and sys metadata according to\n". " the alignment of the words that are subsumed within the metadata\n". " time intervals.\n". " -W for word-optimized mapping.\n". " * The default (time-optimized) mapping maps ref and sys metadata\n". " so as to maximize the time overlap of mapped metadata events.\n". " * Word-optimized mapping maps ref and sys metadata so as to\n". " maximize the overlap in terms of the number of reference words\n". " that are subsumed within the overlapping time interval.\n". " -a Conditional analysis options for metadata detection performance:\n". " c for performance versus channel,\n". " f for performance versus file,\n". " g for performance versus gender, and\n". " s for performance versus speaker.\n". " -A Conditional analysis options for word coverage performance:\n". " c for performance versus channel,\n". " f for performance versus file,\n". " -t