You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
253 lines
5.4 KiB
253 lines
5.4 KiB
"""
|
|
The AMI corpus contains 100 hours of meeting recordings.
|
|
This script returns the standard train, dev, and eval splits for the AMI corpus.
|
|
For more information on the dataset, please refer to http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml
|
|
|
|
Authors
|
|
* qingenz123@126.com (Qingen ZHAO) 2022
|
|
|
|
Credits
|
|
|
|
"""
|
|
|
|
ALLOWED_OPTIONS = ["scenario_only", "full_corpus", "full_corpus_asr"]


def get_AMI_split(split_option):
    """
    Prepares train, dev, and test sets for given split_option

    Arguments
    ---------
    split_option: str
        The standard split option.
        Allowed options: "scenario_only", "full_corpus", "full_corpus_asr"

    Returns
    -------
    Meeting IDs for train, dev, and test sets for given split_option,
    as a tuple of three lists ``(train_set, dev_set, test_set)``.
    If split_option is not one of ALLOWED_OPTIONS, a message is printed
    and None is returned instead of a tuple.
    """

    if split_option not in ALLOWED_OPTIONS:
        print(
            f'Invalid split "{split_option}" requested!\nValid split_options are: ',
            ALLOWED_OPTIONS,
        )
        # Callers that unpack the result must handle this None themselves.
        return None

    if split_option == "scenario_only":

        train_set = [
            "ES2002",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
        ]

        dev_set = [
            "ES2003",
            "ES2011",
            "IS1008",
            "TS3004",
            "TS3006",
        ]

        test_set = [
            "ES2004",
            "ES2014",
            "IS1009",
            "TS3003",
            "TS3007",
        ]

    elif split_option == "full_corpus":
        # List of train: SA (TRAINING PART OF SEEN DATA)
        train_set = [
            "ES2002",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
            "EN2001",
            "EN2003",
            "EN2004",
            "EN2005",
            "EN2006",
            "EN2009",
            "IN1001",
            "IN1002",
            "IN1005",
            "IN1007",
            "IN1008",
            "IN1009",
            "IN1012",
            "IN1013",
            "IN1014",
            "IN1016",
        ]

        # List of dev: SB (DEV PART OF SEEN DATA)
        dev_set = [
            "ES2003",
            "ES2011",
            "IS1008",
            "TS3004",
            "TS3006",
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
            "IB4010",
            "IB4011",
        ]

        # List of test: SC (UNSEEN DATA FOR EVALUATION)
        # Note that IB4005 does not appear because it has speakers in common with two sets of data.
        test_set = [
            "ES2004",
            "ES2014",
            "IS1009",
            "TS3003",
            "TS3007",
            "EN2002",
        ]

    elif split_option == "full_corpus_asr":
        # NOTE(review): the lists built first in this branch (the large
        # train_set, dev_set0, test_set0, dev_set1, test_set1) are never
        # returned. The three assignments at the end of the branch replace
        # the result with a small IB4001-IB4004 subset, which looks like a
        # debugging override -- confirm before relying on this option.
        # The override is assumed to apply to this branch only -- TODO confirm.
        train_set = [
            "ES2002",
            "ES2003",
            "ES2005",
            "ES2006",
            "ES2007",
            "ES2008",
            "ES2009",
            "ES2010",
            "ES2012",
            "ES2013",
            "ES2014",
            "ES2015",
            "ES2016",
            "IS1000",
            "IS1001",
            "IS1002",
            "IS1003",
            "IS1004",
            "IS1005",
            "IS1006",
            "IS1007",
            "TS3005",
            "TS3006",
            "TS3007",
            "TS3008",
            "TS3009",
            "TS3010",
            "TS3011",
            "TS3012",
            "EN2001",
            "EN2003",
            "EN2004",
            "EN2005",
            "EN2006",
            "EN2009",
            "IN1001",
            "IN1002",
            "IN1005",
            "IN1007",
            "IN1008",
            "IN1009",
            "IN1012",
            "IN1013",
            "IN1014",
            "IN1016",
        ]

        # Currently unused (see NOTE above): meeting-level dev/test lists.
        dev_set0 = [
            "ES2011",
            "IS1008",
            "TS3004",
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
            "IB4010",
            "IB4011",
        ]

        test_set0 = [
            "ES2004",
            "IS1009",
            "TS3003",
            "EN2002",
        ]

        # Currently unused (see NOTE above): variant lists in which some
        # IDs carry an "a" suffix.
        dev_set1 = [
            "ES2011a",
            "IS1008a",
            "TS3004a",
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
        ]
        test_set1 = [
            "ES2004a",
            "IS1009a",
            "TS3003a",
            "EN2001a",
        ]

        # These are the values actually returned for "full_corpus_asr".
        train_set = [
            "IB4001",
            "IB4002",
            "IB4003",
            "IB4004",
        ]
        dev_set = [
            "IB4002",
        ]
        test_set = [
            "IB4004",
        ]

    return train_set, dev_set, test_set
|