# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/dataio.py)
"""
Data reading and writing.
Authors
* Mirco Ravanelli 2020
* Aku Rouhe 2020
* Ju-Chieh Chou 2020
* Samuele Cornell 2020
* Abdel HEBA 2020
"""
import csv
import hashlib
import json
import logging
import os
import pickle
import re
import time
import numpy as np
import paddle
import soundfile
logger = logging.getLogger(__name__)
def load_data_json(json_path, replacements={}):
"""Loads JSON and recursively formats string values.
Arguments
----------
json_path : str
        Path to JSON file.
replacements : dict
(Optional dict), e.g., {"data_folder": "/home/PaddleSpeech/data"}.
This is used to recursively format all string values in the data.
Returns
-------
dict
JSON data with replacements applied.
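    Example
    -------
    A minimal sketch; the JSON content, the temporary path, and the
    "data_folder" replacement key below are illustrative.
    >>> import json, os, tempfile
    >>> spec = {"utt1": {"wav": "{data_folder}/utt1.wav", "spk_id": "A"}}
    >>> path = os.path.join(tempfile.mkdtemp(), "data.json")
    >>> with open(path, "w") as fo:
    ...     json.dump(spec, fo)
    >>> data = load_data_json(path, {"data_folder": "/corpus"})
    >>> data["utt1"]["wav"]
    '/corpus/utt1.wav'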
"""
with open(json_path, "r") as f:
out_json = json.load(f)
_recursive_format(out_json, replacements)
return out_json
def _recursive_format(data, replacements):
# Data: dict or list, replacements : dict
# Replaces string keys in replacements by their values
# at all levels of data (in str values)
# Works in-place.
if isinstance(data, dict):
for key, item in data.items():
if isinstance(item, dict) or isinstance(item, list):
_recursive_format(item, replacements)
elif isinstance(item, str):
data[key] = item.format_map(replacements)
# If not dict, list or str, do nothing
if isinstance(data, list):
for i, item in enumerate(data):
if isinstance(item, dict) or isinstance(item, list):
_recursive_format(item, replacements)
elif isinstance(item, str):
data[i] = item.format_map(replacements)
# If not dict, list or str, do nothing
def load_data_csv(csv_path, replacements={}):
"""Loads CSV and formats string values.
Uses the legacy CSV data format, where the CSV must have an
'ID' field.
If there is a field called duration, it is interpreted as a float.
The rest of the fields are left as they are (legacy _format and _opts fields
are not used to load the data in any special way).
Bash-like string replacements with $to_replace are supported.
Arguments
----------
csv_path : str
Path to CSV file.
replacements : dict
(Optional dict), e.g., {"data_folder": "/home/PaddleSpeech/data"}
This is used to recursively format all string values in the data.
Returns
-------
dict
CSV data with replacements applied.
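    Example
    -------
    A minimal sketch; the CSV content and the "data_folder" replacement
    below are illustrative.
    >>> import os, tempfile
    >>> path = os.path.join(tempfile.mkdtemp(), "data.csv")
    >>> with open(path, "w") as fo:
    ...     print("ID,duration,wav", file=fo)
    ...     print("utt1,1.45,$data_folder/utt1.wav", file=fo)
    >>> data = load_data_csv(path, {"data_folder": "/corpus"})
    >>> data["utt1"]["wav"]
    '/corpus/utt1.wav'
    >>> data["utt1"]["duration"]
    1.45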
"""
with open(csv_path, newline="") as csvfile:
result = {}
reader = csv.DictReader(csvfile, skipinitialspace=True)
variable_finder = re.compile(r"\$([\w.]+)")
for row in reader:
# ID:
try:
data_id = row["ID"]
del row["ID"] # This is used as a key in result, instead.
except KeyError:
raise KeyError("CSV has to have an 'ID' field, with unique ids"
" for all data points")
if data_id in result:
raise ValueError(f"Duplicate id: {data_id}")
# Replacements:
for key, value in row.items():
try:
row[key] = variable_finder.sub(
lambda match: str(replacements[match[1]]), value)
except KeyError:
raise KeyError(f"The item {value} requires replacements "
"which were not supplied.")
# Duration:
if "duration" in row:
row["duration"] = float(row["duration"])
result[data_id] = row
return result
def read_audio(waveforms_obj):
"""General audio loading, based on a custom notation.
Expected use case is in conjunction with Datasets
specified by JSON.
The custom notation:
The annotation can be just a path to a file:
"/path/to/wav1.wav"
Or can specify more options in a dict:
{"file": "/path/to/wav2.wav",
"start": 8000,
"stop": 16000
}
Arguments
----------
waveforms_obj : str, dict
Audio reading annotation, see above for format.
Returns
-------
    numpy.ndarray
        Audio array with shape: (samples, ).
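    Example
    -------
    A minimal sketch; the one-second silent file is created only for
    illustration.
    >>> import os, tempfile
    >>> import numpy as np
    >>> import soundfile
    >>> path = os.path.join(tempfile.mkdtemp(), "silence.wav")
    >>> soundfile.write(path, np.zeros(16000, dtype="float32"), 16000)
    >>> read_audio(path).shape
    (16000,)
    >>> read_audio({"file": path, "start": 8000, "stop": 16000}).shape
    (8000,)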
"""
if isinstance(waveforms_obj, str):
audio, _ = soundfile.read(waveforms_obj, dtype="float32")
return audio
    path = waveforms_obj["file"]
    start = waveforms_obj.get("start", 0)
    # If "stop" is not specified, read to the end of the file.
    stop = waveforms_obj.get("stop", None)
    audio, fs = soundfile.read(path, start=start, stop=stop, dtype="float32")
    return audio
def read_audio_multichannel(waveforms_obj):
"""General audio loading, based on a custom notation.
Expected use case is in conjunction with Datasets
specified by JSON.
The custom notation:
The annotation can be just a path to a file:
"/path/to/wav1.wav"
Multiple (possibly multi-channel) files can be specified, as long as they
have the same length:
{"files": [
"/path/to/wav1.wav",
"/path/to/wav2.wav"
]
}
Or you can specify a single file more succinctly:
{"files": "/path/to/wav2.wav"}
Offset number samples and stop number samples also can be specified to read
only a segment within the files.
{"files": [
"/path/to/wav1.wav",
"/path/to/wav2.wav"
]
"start": 8000
"stop": 16000
}
Arguments
----------
waveforms_obj : str, dict
Audio reading annotation, see above for format.
Returns
-------
paddle.Tensor
Audio tensor with shape: (samples, ).
"""
if isinstance(waveforms_obj, str):
audio, _ = soundfile.read(waveforms_obj, dtype="float32")
audio = paddle.to_tensor(audio)
return audio
files = waveforms_obj["files"]
if not isinstance(files, list):
files = [files]
waveforms = []
    start = waveforms_obj.get("start", 0)
    # If "stop" is not specified, read each file to the end.
    stop = waveforms_obj.get("stop", None)
    for f in files:
        audio, fs = soundfile.read(
            f, start=start, stop=stop, dtype="float32")
audio = paddle.to_tensor(audio)
waveforms.append(audio)
out = paddle.concat(waveforms, 0)
return out
def write_audio(filepath, audio, samplerate):
"""Write audio on disk. It is basically a wrapper to support saving
audio signals in format (audio, channels).
Arguments
---------
filepath: path
Path where to save the audio file.
audio : paddle.Tensor
        Audio signal in the expected format (signal, channels).
samplerate: int
Sample rate (e.g., 16000).
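    Example
    -------
    A minimal sketch writing one second of silence; the output path is
    illustrative.
    >>> import os, tempfile
    >>> import numpy as np
    >>> sig = paddle.to_tensor(np.zeros(16000, dtype="float32"))
    >>> path = os.path.join(tempfile.mkdtemp(), "silence.wav")
    >>> write_audio(path, sig, 16000)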
"""
    # soundfile expects data shaped (frames,) or (frames, channels), which
    # already matches the (signal, channels) input format, so no transpose
    # is needed.
    if isinstance(audio, paddle.Tensor):
        audio = audio.numpy()
    soundfile.write(filepath, audio, samplerate)
def load_pickle(pickle_path):
"""Utility function for loading .pkl pickle files.
Arguments
---------
pickle_path : str
Path to pickle file.
Returns
-------
out : object
Python object loaded from pickle.
"""
with open(pickle_path, "rb") as f:
out = pickle.load(f)
return out
def to_floatTensor(x: (list, tuple, np.ndarray)):
"""
Arguments
---------
x : (list, tuple, np.ndarray)
Input data to be converted to paddle float.
Returns
-------
tensor : paddle.tensor
Data now in paddle.tensor float datatype.
"""
return paddle.to_tensor(x, dtype='float32')
def to_doubleTensor(x: (list, tuple, np.ndarray)):
"""
Arguments
---------
x : (list, tuple, np.ndarray)
Input data to be converted to paddle double.
Returns
-------
tensor : paddle.tensor
Data now in paddle.tensor double datatype.
"""
return paddle.to_tensor(x, dtype='float64')
def to_longTensor(x: (list, tuple, np.ndarray)):
"""
Arguments
---------
x : (list, tuple, np.ndarray)
Input data to be converted to paddle long.
Returns
-------
tensor : paddle.tensor
Data now in paddle.tensor long datatype.
"""
return paddle.to_tensor(x, dtype='int64')
def convert_index_to_lab(batch, ind2lab):
"""Convert a batch of integer IDs to string labels.
Arguments
---------
batch : list
List of lists, a batch of sequences.
ind2lab : dict
Mapping from integer IDs to labels.
Returns
-------
list
List of lists, same size as batch, with labels from ind2lab.
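    Example
    -------
    A small sketch mapping integer IDs back to characters.
    >>> ind2lab = {1: "h", 2: "e", 3: "l", 4: "o"}
    >>> convert_index_to_lab([[1, 2, 3, 3, 4]], ind2lab)
    [['h', 'e', 'l', 'l', 'o']]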
"""
return [[ind2lab[int(index)] for index in seq] for seq in batch]
def relative_time_to_absolute(batch, relative_lens, rate):
"""Converts relative length to the absolute duration.
Operates on batch level.
Arguments
---------
batch : paddle.tensor
Sequences to determine the duration for.
relative_lens : paddle.tensor
The relative length of each sequence in batch. The longest sequence in
the batch needs to have relative length 1.0.
rate : float
The rate at which sequence elements occur in real-world time. Sample
rate, if batch is raw wavs (recommended) or 1/frame_shift if batch is
features. This has to have 1/s as the unit.
Returns
    -------
paddle.tensor
Duration of each sequence in seconds.
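    Example
    -------
    A small sketch with a two-sequence batch of raw samples at 16 kHz.
    >>> wavs = paddle.zeros([2, 16000])
    >>> relative_lens = paddle.to_tensor([1.0, 0.5])
    >>> relative_time_to_absolute(wavs, relative_lens, rate=16000).tolist()
    [1.0, 0.5]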
"""
max_len = batch.shape[1]
durations = paddle.round(relative_lens * max_len) / rate
return durations
class IterativeCSVWriter:
"""Write CSV files a line at a time.
Arguments
---------
outstream : file-object
A writeable stream
data_fields : list
List of the optional keys to write. Each key will be expanded,
producing three fields: key, key_format, key_opts.
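    defaults : dict
        Mapping from CSV field to a default value, used whenever that field
        is not supplied to the write methods.
    Example
    -------
    A minimal sketch writing a single data line to an in-memory stream; the
    field names and values are illustrative.
    >>> import io
    >>> f = io.StringIO()
    >>> writer = IterativeCSVWriter(f, ["wav"])
    >>> writer.write(ID="utt1", duration=1.45, wav="/corpus/utt1.wav")
    >>> print(f.getvalue())
    ID,duration,wav,wav_format,wav_opts
    utt1,1.45,/corpus/utt1.wav,,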
"""
def __init__(self, outstream, data_fields, defaults={}):
self._outstream = outstream
self.fields = ["ID", "duration"] + self._expand_data_fields(data_fields)
self.defaults = defaults
self._outstream.write(",".join(self.fields))
def set_default(self, field, value):
"""Sets a default value for the given CSV field.
Arguments
---------
field : str
A field in the CSV.
value
The default value.
"""
if field not in self.fields:
raise ValueError(f"{field} is not a field in this CSV!")
self.defaults[field] = value
def write(self, *args, **kwargs):
"""Writes one data line into the CSV.
Arguments
---------
*args
Supply every field with a value in positional form OR.
**kwargs
Supply certain fields by key. The ID field is mandatory for all
lines, but others can be left empty.
"""
if args and kwargs:
raise ValueError(
"Use either positional fields or named fields, but not both.")
if args:
if len(args) != len(self.fields):
raise ValueError("Need consistent fields")
to_write = [str(arg) for arg in args]
if kwargs:
if "ID" not in kwargs:
raise ValueError("I'll need to see some ID")
full_vals = self.defaults.copy()
full_vals.update(kwargs)
to_write = [str(full_vals.get(field, "")) for field in self.fields]
self._outstream.write("\n")
self._outstream.write(",".join(to_write))
def write_batch(self, *args, **kwargs):
"""Writes a batch of lines into the CSV.
Here each argument should be a list with the same length.
Arguments
---------
*args
Supply every field with a value in positional form OR.
**kwargs
Supply certain fields by key. The ID field is mandatory for all
lines, but others can be left empty.
"""
if args and kwargs:
raise ValueError(
"Use either positional fields or named fields, but not both.")
if args:
if len(args) != len(self.fields):
raise ValueError("Need consistent fields")
for arg_row in zip(*args):
self.write(*arg_row)
if kwargs:
if "ID" not in kwargs:
raise ValueError("I'll need to see some ID")
keys = kwargs.keys()
for value_row in zip(*kwargs.values()):
kwarg_row = dict(zip(keys, value_row))
self.write(**kwarg_row)
@staticmethod
def _expand_data_fields(data_fields):
expanded = []
for data_field in data_fields:
expanded.append(data_field)
expanded.append(data_field + "_format")
expanded.append(data_field + "_opts")
return expanded
def write_txt_file(data, filename, sampling_rate=None):
"""Write data in text format.
Arguments
---------
data : str, list, paddle.tensor, numpy.ndarray
The data to write in the text file.
filename : str
Path to file where to write the data.
sampling_rate : None
Not used, just here for interface compatibility.
Returns
-------
None
"""
del sampling_rate # Not used.
# Check if the path of filename exists
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w") as fout:
if isinstance(data, paddle.Tensor):
data = data.tolist()
if isinstance(data, np.ndarray):
data = data.tolist()
if isinstance(data, list):
for line in data:
print(line, file=fout)
if isinstance(data, str):
print(data, file=fout)
def write_stdout(data, filename=None, sampling_rate=None):
"""Write data to standard output.
Arguments
---------
data : str, list, paddle.Tensor, numpy.ndarray
The data to write in the text file.
filename : None
Not used, just here for compatibility.
sampling_rate : None
Not used, just here for compatibility.
Returns
-------
None
"""
# Managing paddle.Tensor
if isinstance(data, paddle.Tensor):
data = data.tolist()
# Managing np.ndarray
if isinstance(data, np.ndarray):
data = data.tolist()
if isinstance(data, list):
for line in data:
print(line)
if isinstance(data, str):
print(data)
def length_to_mask(length, max_len=None, dtype=None, device=None):
"""Creates a binary mask for each sequence.
Arguments
---------
length : LongTensor
Containing the length of each sequence in the batch. Must be 1D.
max_len : int
Max length for the mask, also the size of the second dimension.
dtype : dtype, default: None
The dtype of the generated mask.
device: device, default: None
The device to put the mask variable.
Returns
-------
mask : tensor
The binary mask.
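    Example
    -------
    A small sketch that passes max_len, dtype, and device explicitly; when
    they are left as None, they are derived from the length tensor instead.
    >>> length = paddle.to_tensor([1, 2, 3])
    >>> mask = length_to_mask(length, max_len=3, dtype="int64", device="cpu")
    >>> mask.tolist()
    [[1, 0, 0], [1, 1, 0], [1, 1, 1]]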
"""
assert len(length.shape) == 1
if max_len is None:
max_len = length.max().long().item() # using arange to generate mask
mask = paddle.arange(
max_len, dtype=length.dtype).expand(
[len(length), max_len]) < length.unsqueeze(1)
if dtype is None:
dtype = length.dtype
if device is None:
device = length.device
mask = paddle.to_tensor(mask, dtype=dtype)
return mask
def read_kaldi_lab(kaldi_ali, kaldi_lab_opts):
"""Read labels in kaldi format.
Uses kaldi IO.
Arguments
---------
kaldi_ali : str
Path to directory where kaldi alignments are stored.
kaldi_lab_opts : str
A string that contains the options for reading the kaldi alignments.
Returns
-------
lab : dict
A dictionary containing the labels.
Note
----
This depends on kaldi-io-for-python. Install it separately.
See: https://github.com/vesis84/kaldi-io-for-python
"""
# EXTRA TOOLS
try:
import kaldi_io
except ImportError:
raise ImportError("Could not import kaldi_io. Install it to use this.")
# Reading the Kaldi labels
lab = {
k: v
for k, v in kaldi_io.read_vec_int_ark(
"gunzip -c " + kaldi_ali + "/ali*.gz | " + kaldi_lab_opts + " " +
kaldi_ali + "/final.mdl ark:- ark:-|")
}
return lab
def get_md5(file):
"""Get the md5 checksum of an input file.
Arguments
---------
file : str
        Path to the file for which to compute the checksum.
Returns
-------
md5
Checksum for the given filepath.
"""
# Lets read stuff in 64kb chunks!
BUF_SIZE = 65536
md5 = hashlib.md5()
# Computing md5
with open(file, "rb") as f:
while True:
data = f.read(BUF_SIZE)
if not data:
break
md5.update(data)
return md5.hexdigest()
def save_md5(files, out_file):
"""Saves the md5 of a list of input files as a pickled dict into a file.
Arguments
---------
files : list
List of input files from which we will compute the md5.
    out_file : str
The path where to store the output pkl file.
Returns
-------
None
"""
# Initialization of the dictionary
md5_dict = {}
# Computing md5 for all the files in the list
for file in files:
md5_dict[file] = get_md5(file)
# Saving dictionary in pkl format
save_pkl(md5_dict, out_file)
def save_pkl(obj, file):
"""Save an object in pkl format.
Arguments
---------
obj : object
        Object to save in pkl format.
    file : str
        Path to the output file.
"""
with open(file, "wb") as f:
pickle.dump(obj, f)
def load_pkl(file):
"""Loads a pkl file.
For an example, see `save_pkl`.
Arguments
---------
file : str
Path to the input pkl file.
Returns
-------
The loaded object.
"""
# Deals with the situation where two processes are trying
# to access the same label dictionary by creating a lock
count = 100
while count > 0:
if os.path.isfile(file + ".lock"):
time.sleep(1)
count -= 1
else:
break
try:
open(file + ".lock", "w").close()
with open(file, "rb") as f:
return pickle.load(f)
finally:
if os.path.isfile(file + ".lock"):
os.remove(file + ".lock")
def prepend_bos_token(label, bos_index):
"""Create labels with <bos> token at the beginning.
Arguments
---------
label : IntTensor
Containing the original labels. Must be of size: [batch_size, max_length].
bos_index : int
The index for <bos> token.
Returns
-------
new_label : tensor
The new label with <bos> at the beginning.
"""
new_label = label.long().clone()
batch_size = label.shape[0]
bos = new_label.new_zeros(batch_size, 1).fill_(bos_index)
new_label = paddle.concat([bos, new_label], axis=1)
return new_label
def append_eos_token(label, length, eos_index):
"""Create labels with <eos> token appended.
Arguments
---------
label : IntTensor
Containing the original labels. Must be of size: [batch_size, max_length]
length : LongTensor
Containing the original length of each label sequences. Must be 1D.
eos_index : int
The index for <eos> token.
Returns
-------
new_label : tensor
The new label with <eos> appended.
"""
new_label = paddle.to_tensor(label, dtype="int32").clone()
batch_size = label.shape[0]
pad = paddle.zeros([batch_size, 1], dtype=new_label.dtype)
    new_label = paddle.concat([new_label, pad], axis=1)
new_label[paddle.arange(batch_size), paddle.to_tensor(
length, dtype="int64")] = eos_index
return new_label
def merge_char(sequences, space="_"):
"""Merge characters sequences into word sequences.
Arguments
---------
sequences : list
Each item contains a list, and this list contains a character sequence.
space : string
The token represents space. Default: _
Returns
-------
The list contains word sequences for each sentence.
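    Example
    -------
    A small sketch with one character sequence and "_" as the space token.
    >>> merge_char([["a", "b", "_", "c", "_", "d", "e"]])
    [['ab', 'c', 'de']]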
"""
results = []
for seq in sequences:
words = "".join(seq).split(space)
results.append(words)
return results
def merge_csvs(data_folder, csv_lst, merged_csv):
"""Merging several csv files into one file.
Arguments
---------
    data_folder : string
        The folder containing the csv files to be merged; the merged file is
        written to this folder as well.
    csv_lst : list
        Filenames of the csv files to be merged.
    merged_csv : string
        The filename of the merged csv file to write.
"""
write_path = os.path.join(data_folder, merged_csv)
    if os.path.isfile(write_path):
        logger.info("Skipping merging. Completed in previous run.")
        return
with open(os.path.join(data_folder, csv_lst[0])) as f:
header = f.readline()
lines = []
for csv_file in csv_lst:
with open(os.path.join(data_folder, csv_file)) as f:
for i, line in enumerate(f):
if i == 0:
# Checking header
if line != header:
                        raise ValueError("Different header for "
                                         f"{csv_lst[0]} and {csv_file}.")
continue
lines.append(line)
with open(write_path, "w") as f:
f.write(header)
for line in lines:
f.write(line)
logger.info(f"{write_path} is created.")
def split_word(sequences, space="_"):
"""Split word sequences into character sequences.
Arguments
---------
sequences : list
        Each item contains a list, and this list contains a word sequence.
space : string
The token represents space. Default: _
Returns
-------
        The list contains character sequences for each sentence.
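    Example
    -------
    A small sketch with one word sequence and "_" as the space token.
    >>> split_word([["ab", "c", "de"]])
    [['a', 'b', '_', 'c', '_', 'd', 'e']]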
"""
results = []
for seq in sequences:
chars = list(space.join(seq))
results.append(chars)
return results