|
|
@ -55,6 +55,7 @@ args = parser.parse_args()
|
|
|
|
def create_manifest(data_dir, manifest_path_prefix):
|
|
|
|
def create_manifest(data_dir, manifest_path_prefix):
|
|
|
|
print("Creating manifest %s ..." % manifest_path_prefix)
|
|
|
|
print("Creating manifest %s ..." % manifest_path_prefix)
|
|
|
|
json_lines = []
|
|
|
|
json_lines = []
|
|
|
|
|
|
|
|
reference_lines = []
|
|
|
|
transcript_path = os.path.join(data_dir, 'transcript',
|
|
|
|
transcript_path = os.path.join(data_dir, 'transcript',
|
|
|
|
'aishell_transcript_v0.8.txt')
|
|
|
|
'aishell_transcript_v0.8.txt')
|
|
|
|
transcript_dict = {}
|
|
|
|
transcript_dict = {}
|
|
|
@ -88,6 +89,7 @@ def create_manifest(data_dir, manifest_path_prefix):
|
|
|
|
duration = float(len(audio_data) / samplerate)
|
|
|
|
duration = float(len(audio_data) / samplerate)
|
|
|
|
text = transcript_dict[audio_id]
|
|
|
|
text = transcript_dict[audio_id]
|
|
|
|
json_lines.append(audio_path)
|
|
|
|
json_lines.append(audio_path)
|
|
|
|
|
|
|
|
reference_lines.append(str(total_num+1) + "\t" + text)
|
|
|
|
|
|
|
|
|
|
|
|
total_sec += duration
|
|
|
|
total_sec += duration
|
|
|
|
total_text += len(text)
|
|
|
|
total_text += len(text)
|
|
|
@ -98,6 +100,10 @@ def create_manifest(data_dir, manifest_path_prefix):
|
|
|
|
for line in json_lines:
|
|
|
|
for line in json_lines:
|
|
|
|
fout.write(line + '\n')
|
|
|
|
fout.write(line + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
with codecs.open(manifest_path + ".text", 'w', 'utf-8') as fout:
|
|
|
|
|
|
|
|
for line in reference_lines:
|
|
|
|
|
|
|
|
fout.write(line + '\n')
|
|
|
|
|
|
|
|
|
|
|
|
manifest_dir = os.path.dirname(manifest_path_prefix)
|
|
|
|
manifest_dir = os.path.dirname(manifest_path_prefix)
|
|
|
|
|
|
|
|
|
|
|
|
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
|
|
|
|
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
|
|
|
|