You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
PaddleSpeech/demos/speech_web/speech_server/src/AudioManeger.py

154 lines
5.3 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import datetime
import os
import wave
import numpy as np
from .util import randName
class AudioMannger:
def __init__(self,
robot,
frame_length=160,
frame=10,
data_width=2,
vad_default=300):
# 二进制 pcm 流
self.audios = b''
self.asr_result = ""
# Speech 核心主体
self.robot = robot
self.file_dir = "source"
os.makedirs(self.file_dir, exist_ok=True)
self.vad_deafult = vad_default
self.vad_threshold = vad_default
self.vad_threshold_path = os.path.join(self.file_dir,
"vad_threshold.npy")
# 10ms 一帧
self.frame_length = frame_length
# 10帧检测一次 vad
self.frame = frame
# int 16, 两个bytes
self.data_width = data_width
# window
self.window_length = frame_length * frame * data_width
# 是否开始录音
self.on_asr = False
self.silence_cnt = 0
self.max_silence_cnt = 4
self.is_pause = False # 录音暂停与恢复
def init(self):
if os.path.exists(self.vad_threshold_path):
# 平均响度文件存在
self.vad_threshold = np.load(self.vad_threshold_path)
def clear_audio(self):
# 清空 pcm 累积片段与 asr 识别结果
self.audios = b''
def clear_asr(self):
self.asr_result = ""
def compute_chunk_volume(self, start_index, pcm_bins):
# 根据帧长计算能量平均值
pcm_bin = pcm_bins[start_index:start_index + self.window_length]
# 转成 numpy
pcm_np = np.frombuffer(pcm_bin, np.int16)
# 归一化 + 计算响度
x = pcm_np.astype(np.float32)
x = np.abs(x)
return np.mean(x)
def is_speech(self, start_index, pcm_bins):
# 检查是否没
if start_index > len(pcm_bins):
return False
# 检查从这个 start 开始是否为静音帧
energy = self.compute_chunk_volume(
start_index=start_index, pcm_bins=pcm_bins)
# print(energy)
if energy > self.vad_threshold:
return True
else:
return False
def compute_env_volume(self, pcm_bins):
max_energy = 0
start = 0
while start < len(pcm_bins):
energy = self.compute_chunk_volume(
start_index=start, pcm_bins=pcm_bins)
if energy > max_energy:
max_energy = energy
start += self.window_length
self.vad_threshold = max_energy + 100 if max_energy > self.vad_deafult else self.vad_deafult
# 保存成文件
np.save(self.vad_threshold_path, self.vad_threshold)
print(f"vad 阈值大小: {self.vad_threshold}")
print(f"环境采样保存: {os.path.realpath(self.vad_threshold_path)}")
def stream_asr(self, pcm_bin):
# 先把 pcm_bin 送进去做端点检测
start = 0
while start < len(pcm_bin):
if self.is_speech(start_index=start, pcm_bins=pcm_bin):
self.on_asr = True
self.silence_cnt = 0
print("录音中")
self.audios += pcm_bin[start:start + self.window_length]
else:
if self.on_asr:
self.silence_cnt += 1
if self.silence_cnt > self.max_silence_cnt:
self.on_asr = False
self.silence_cnt = 0
# 录音停止
print("录音停止")
# audios 保存为 wav, 送入 ASR
if len(self.audios) > 2 * 16000:
file_path = os.path.join(
self.file_dir,
"asr_" + datetime.datetime.strftime(
datetime.datetime.now(),
'%Y%m%d%H%M%S') + randName() + ".wav")
self.save_audio(file_path=file_path)
self.asr_result = self.robot.speech2text(file_path)
self.clear_audio()
return self.asr_result
else:
# 正常接收
print("录音中 静音")
self.audios += pcm_bin[start:start + self.window_length]
start += self.window_length
return ""
def save_audio(self, file_path):
print("保存音频")
wf = wave.open(file_path, 'wb') # 创建一个音频文件名字为“01.wav"
wf.setnchannels(1) # 设置声道数为2
wf.setsampwidth(2) # 设置采样深度为
wf.setframerate(16000) # 设置采样率为16000
# 将数据写入创建的音频文件
wf.writeframes(self.audios)
# 写完后将文件关闭
wf.close()
def end(self):
# audios 保存为 wav, 送入 ASR
file_path = os.path.join(self.file_dir, "asr.wav")
self.save_audio(file_path=file_path)
return self.robot.speech2text(file_path)
def stop(self):
self.is_pause = True
self.audios = b''
def resume(self):
self.is_pause = False