Problem description:
The front end captures and streams audio with WebRTC; the back end handles it with Python's aiortc. Audio and text in the client → server direction work fine, but in the server → client direction only the first frame ever comes back. (The normal back-end pipeline is asr → nlu → tts → return answer; to simplify testing I generate the returned frames from a local audio file instead.)
My intent is to buffer the generated result in answer_frames and then return it frame by frame on subsequent recv() calls. The log line answer queue left: 1376 is printed only once, i.e. only the first frame is returned.
The aiortc handling code is below:
import asyncio
import logging
from asyncio import Queue

import webrtcvad
from aiortc import MediaStreamTrack

logger = logging.getLogger(__name__)

# AudioConvert is my own helper class (not shown here); is_silence() and
# convert() are likewise elided from this snippet.

class AudioStreamTrack(MediaStreamTrack):
    kind = "audio"

    def __init__(self, track: MediaStreamTrack, target_rate=48000, target_channels=2,
                 frame_duration_ms=20, test=True):
        super().__init__()
        self.track = track
        self.target_rate = target_rate
        self.target_channels = target_channels
        self.frame_size = int(target_rate * frame_duration_ms / 1000)
        self._timestamp = 0  # initialized but currently unused
        # self.processor = processor
        self.vad = webrtcvad.Vad(1)  # WebRTC VAD instance
        self.silence_frames = 0      # consecutive silent-frame counter
        self.buffered_frames = []
        self.processing = False
        self.final_audio = None
        self.audio_convert = AudioConvert()
        self.test = test
        self.answer_frames = Queue()  # asyncio.Queue of outgoing answer frames

    async def recv(self):
        try:
            frame = await self.track.recv()
            print('receive frame type:', type(frame))
            self.buffered_frames.append(frame)
            # If answer frames are queued, return one of them instead of
            # echoing the incoming frame.
            if not self.answer_frames.empty():
                answer = await self.answer_frames.get()
                print('answer queue left:', self.answer_frames.qsize())
                return answer
            # Use VAD to decide whether the incoming frame is silence.
            is_silent = self.is_silence(frame)
            print('is_silence:', is_silent)
            if is_silent:
                logger.info('silence + 1')
                self.silence_frames += 1
            else:
                logger.info('speaking')
                self.silence_frames = 0
            # After more than 30 consecutive silent frames, start generating
            # the answer exactly once.
            if not self.processing and self.silence_frames > 30:
                self.processing = True
                if self.test:
                    print('build test 48k audio')
                    pcm = self.audio_convert.gen_audio_from_local('test_48k.wav')
                    asyncio.create_task(self.generate_answer_frames(pcm))
            return frame
        except Exception as e:
            logger.error('Error in recv: %s', str(e))
            raise

    async def generate_answer_frames(self, pcm_data):
        frame_list = self.convert(pcm_data)
        for frame in frame_list:
            await self.answer_frames.put(frame)
        print('generate done')
        asyncio.create_task(self.audio_convert.play_frames(
            frame_list, self.target_rate, self.target_channels))
        self.processing = False
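
The code calls self.convert(pcm_data) without showing it. For reference, here is a minimal sketch of what such a helper might look like: it chunks interleaved 16-bit PCM into 20 ms av.AudioFrame objects. The function name, the s16/stereo assumptions, and the explicit pts/time_base stamping are my own, not necessarily the author's, but aiortc's own example tracks set sample_rate, pts, and time_base on every frame they return from recv(), so frames built from a local file need these fields set as well:

import fractions

from av import AudioFrame

def pcm_to_frames(pcm: bytes, rate=48000, channels=2, ptime_ms=20):
    # 2 bytes per sample (s16), channels interleaved in one packed plane.
    samples_per_frame = int(rate * ptime_ms / 1000)
    bytes_per_frame = samples_per_frame * channels * 2
    frames, pts = [], 0
    for off in range(0, len(pcm) - bytes_per_frame + 1, bytes_per_frame):
        frame = AudioFrame(format="s16",
                           layout="stereo" if channels == 2 else "mono",
                           samples=samples_per_frame)
        frame.planes[0].update(pcm[off:off + bytes_per_frame])
        frame.sample_rate = rate
        frame.pts = pts                                # monotonically increasing
        frame.time_base = fractions.Fraction(1, rate)  # ticks = samples
        frames.append(frame)
        pts += samples_per_frame
    return frames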
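
And in case the surrounding setup matters: below is a sketch of the usual aiortc server wiring, where the custom track wraps the browser's incoming track and is handed back to the peer connection. This is my assumption about the code the question does not show; recv() is only driven once the track has been added via addTrack and the connection is established:

from aiortc import RTCPeerConnection

pc = RTCPeerConnection()

@pc.on("track")
def on_track(track):
    if track.kind == "audio":
        # Send the processing track back to the browser; its recv() is then
        # pulled by the RTP sender once per audio frame.
        pc.addTrack(AudioStreamTrack(track))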