求助贴 | webrtc_ai交互 server -> client 推流失败问题

问题描述:

前端用 WebRTC 采集和传输音频,后端用 Python 的 aiortc 处理。client → server 方向的音频和文本都没问题,但是 server → client 方向只返回了第一帧。(后端正常的处理流程为 ASR → NLU → TTS → 返回 answer;这里为了方便测试,我用本地音频文件生成 frame 返回。)

下面是 aiortc 的处理部分。
我的预期是:将结果缓存到 answer_frames 中,后续再逐帧返回。
但日志 answer queue left: 1376 只输出了一次,也就是只返回了第一帧。

class AudioStreamTrack(MediaStreamTrack):
    """Audio track that relays an inbound track and splices in answer audio.

    Every ``recv()`` call pulls exactly one frame from the wrapped track.
    While ``answer_frames`` holds queued frames, one of them is returned in
    place of the inbound frame; otherwise the inbound frame is run through
    VAD bookkeeping and relayed unchanged. Once more than 30 consecutive
    silent frames have been seen (and no answer is being generated), answer
    generation is started in a background task.
    """

    kind = "audio"

    def __init__(
        self,
        track: MediaStreamTrack,
        target_rate=48000,
        target_channels=2,
        frame_duration_ms=20,
        test=True,
    ):
        """Wrap ``track``.

        Args:
            track: Source track whose frames are consumed by ``recv()``.
            target_rate: Sample rate passed to ``play_frames`` and used to
                derive ``frame_size``.
            target_channels: Channel count passed to ``play_frames``.
            frame_duration_ms: Frame duration used to derive ``frame_size``.
            test: When true, the answer is built from the local file
                ``test_48k.wav``.
        """
        super().__init__()
        self.track = track
        self.target_rate = target_rate
        self.target_channels = target_channels
        self.frame_size = int(target_rate * frame_duration_ms / 1000)
        self._timestamp = 0
        self.vad = webrtcvad.Vad(1)  # WebRTC VAD instance
        self.silence_frames = 0  # count of consecutive silent frames
        # NOTE(review): nothing visible here ever drains this list, so it
        # grows by one frame per recv() call — confirm a consumer exists.
        self.buffered_frames = []
        self.processing = False
        self.final_audio = None
        self.audio_convert = AudioConvert()
        self.test = test
        self.answer_frames = Queue()
        # Strong references to fire-and-forget tasks. The event loop only
        # keeps weak references, so an unreferenced task may be garbage
        # collected before it finishes.
        self._background_tasks = set()

    def _spawn(self, coro):
        """Schedule ``coro`` as a task, keep it referenced, log its failure."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._on_task_done)
        return task

    def _on_task_done(self, task):
        """Drop the finished task and surface any exception it raised."""
        self._background_tasks.discard(task)
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error('Background task failed: %s', exc, exc_info=exc)

    @staticmethod
    def _inherit_timing(answer, inbound):
        """Copy ``pts``/``time_base`` from ``inbound`` onto ``answer``.

        The answer frame is emitted in the slot of the inbound frame it
        replaces, so giving it that frame's timing keeps the outgoing stream
        on the same clock as the relayed frames.

        NOTE(review): convert() is not visible here. If it leaves pts or
        time_base unset (or restarts them at 0), the RTP sender downstream
        presumably fails or misorders on the first answer frame, which would
        match "only the first answer frame is sent" — confirm against the
        sender's warning log. This assumes answer frames have the same
        sample rate and duration as inbound frames.
        """
        if inbound.pts is not None:
            answer.pts = inbound.pts
        if inbound.time_base is not None:
            answer.time_base = inbound.time_base
        return answer

    async def recv(self):
        """Return the next outgoing frame.

        Returns:
            A queued answer frame if one is available, otherwise the frame
            just received from the wrapped track.

        Raises:
            Whatever the wrapped track's ``recv()`` or the processing below
            raises; the error is logged and re-raised.
        """
        try:
            frame = await self.track.recv()
            print('receive frame type:', type(frame))
            self.buffered_frames.append(frame)

            if not self.answer_frames.empty():
                # This coroutine is the only consumer, so a non-empty queue
                # guarantees get_nowait() succeeds without suspending.
                answer = self.answer_frames.get_nowait()
                print('answer queue left:', self.answer_frames.qsize())
                return self._inherit_timing(answer, frame)

            # Use VAD to decide whether this frame is silence.
            is_silent = self.is_silence(frame)
            print('is_silence:', is_silent)
            if is_silent:
                logger.info('silence + 1')
                self.silence_frames += 1
            else:
                logger.info('speaking')
                self.silence_frames = 0

            # NOTE(review): silence_frames is not reset when generation
            # starts, so once the queue drains a still-silent input will
            # trigger another answer immediately — confirm this is intended.
            if not self.processing and self.silence_frames > 30:
                self.processing = True

                if self.test:
                    print('build test 48k audio')
                    # NOTE(review): this call runs synchronously inside
                    # recv(); if it reads the file from disk it stalls the
                    # event loop for that long — verify.
                    pcm = self.audio_convert.gen_audio_from_local('test_48k.wav')
                    self._spawn(self.generate_answer_frames(pcm))

            return frame
        except Exception as e:
            logger.error('Error in recv: %s', e)
            raise

    async def generate_answer_frames(self, pcm_data):
        """Convert ``pcm_data`` to frames and queue them for ``recv()``.

        After queueing, playback of the same frames is started in a
        background task. ``processing`` is cleared even when conversion or
        queueing fails, so a single error cannot block all later answers.
        """
        try:
            frame_list = self.convert(pcm_data)
            for frame in frame_list:
                await self.answer_frames.put(frame)
            print('generate done')
            self._spawn(
                self.audio_convert.play_frames(frame_list, self.target_rate, self.target_channels)
            )
        finally:
            self.processing = False

  
1 个赞