Having trouble debugging the streamlit-webrtc component

Hi,

I’m trying to use the webrtc component to stream audio in chunks to a transcription service, but for the past two weeks I just can’t get it to work. Here is my code, with comments detailing my thought process; hopefully you can spot something wrong that I can’t.

import streamlit as st
from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration
from pydub import AudioSegment
from io import BytesIO
from authorised.logger import log
import time
import queue
import math

logger = log("rtc.log")
status_indicator = st.empty()
RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]})
# RTC_CONFIGURATION = RTCConfiguration({"iceServers": [{"urls": ["stun:stun.nottingham.ac.uk:3478", "stun:stun.node4.co.uk:3478"]}], "iceTransportPolicy": "relay"})


class RTCAudioTransmit:

    def __init__(self, audio_queue_size=8192):
        # Frames are released once the receiver queue is at least half full (see transmit()).
        logger.info('RTCAudioTransmit.py - __init__')
        self.audio_queue_size = audio_queue_size
        status_indicator.write("Loading RTC...")
        self.web_rtc = webrtc_streamer(
            key="audio-only",
            mode=WebRtcMode.SENDONLY,
            rtc_configuration=RTC_CONFIGURATION,
#            audio_processor_factory=AudioProcessor
            audio_receiver_size=audio_queue_size,
            media_stream_constraints={"audio": True, "video": False}
        ) 

    def transmit(self):
        logger.info(f'RTCAudioTransmit.py - {str(self.web_rtc)}')
        while self.web_rtc.state.playing:
            # If the receiver's internal frame queue is at least half full
            if self.web_rtc.audio_receiver._frames_queue.qsize() > math.floor(self.audio_queue_size / 2):
                try:
                    status_indicator.write("Releasing Audio Chunk......")
                    logger.info(f'RTCAudioTransmit.py - {str(self.web_rtc)}')
                    # audio_seg is an empty segment to append the incoming frames to.
                    audio_seg = AudioSegment.silent(duration=0)
                    # Get all frames currently queued in the receiver.
                    for frame in self.web_rtc.audio_receiver.get_frames(timeout=6):
                        # Convert each frame into an AudioSegment and append it to audio_seg.
                        audio_seg += self._return_audiosegments_from_audioframes(frame)
                        if audio_seg.duration_seconds > 5.0:
                            logger.info(f'RTCAudioTransmit.py - {audio_seg}')
                            # Export the accumulated segment as WAV into an in-memory buffer.
                            buffer = BytesIO()
                            buffer.name = "audio.wav"
                            audio_seg.export(buffer, format="wav")
                            # Reset the cursor to the start of the buffer.
                            buffer.seek(0)
                            return transcribe(buffer)
                            # Reset audio_seg to empty for the next chunk.
                            audio_seg = AudioSegment.silent(duration=0)
                except queue.Empty:
                    time.sleep(0.25)
                    status_indicator.write("Queue not full yet.")
                    continue
            elif not self.web_rtc.state.playing:
                status_indicator.write("AudioReciver is not detecting any microphone audio.")
                continue
            else:
                continue

    def _return_audiosegments_from_audioframes(self, af):
        logger.info(f'RTCAudioTransmit.py - {str(af)}')
        raw_audio_data = af.planes[0].to_bytes()
        return AudioSegment(
            data=raw_audio_data,
            sample_width=af.format.bytes,
            frame_rate=af.sample_rate,
            channels=len(af.layout.channels),
        )

As you can see, at the line return transcribe(buffer) I send the audio chunk to the transcription service and return the JSON text, but could returning here be the problem?
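
For instance, I did wonder whether I should be calling transcribe() inside the loop and collecting the results, rather than returning on the first chunk. Something roughly like this is what I had in mind (results would be a new list I’d create at the top of transmit(); transcribe() is my own helper):

# Inside the for-frame loop, instead of returning:
if audio_seg.duration_seconds > 5.0:
    buffer = BytesIO()
    buffer.name = "audio.wav"
    audio_seg.export(buffer, format="wav")
    buffer.seek(0)
    # Hand the chunk off without leaving transmit(), so the loop
    # keeps consuming frames; collect the JSON text as we go.
    results.append(transcribe(buffer))
    # Start a fresh segment for the next chunk.
    audio_seg = AudioSegment.silent(duration=0)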

I did look at the example that uses audio_processor_factory and thought maybe that would work better, but I can’t seem to find any solid API documentation or manual, only examples which don’t really explain what’s going on.
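
From those examples, my best guess is that an audio_processor_factory version would look roughly like the sketch below, but I’m not confident about the details — I’m assuming AudioProcessorBase is the right base class, that recv() is called once per incoming frame, and I don’t even know which mode it needs:

import av
from pydub import AudioSegment
from streamlit_webrtc import webrtc_streamer, WebRtcMode, AudioProcessorBase


class ChunkingAudioProcessor(AudioProcessorBase):
    def __init__(self):
        self.audio_seg = AudioSegment.silent(duration=0)

    def recv(self, frame: av.AudioFrame) -> av.AudioFrame:
        # Convert the incoming frame to an AudioSegment and accumulate it.
        self.audio_seg += AudioSegment(
            data=frame.to_ndarray().tobytes(),
            sample_width=frame.format.bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        # Pass the frame back unchanged; I only want to read it.
        return frame


web_rtc = webrtc_streamer(
    key="audio-processor",
    mode=WebRtcMode.SENDRECV,  # guessing at the mode here
    audio_processor_factory=ChunkingAudioProcessor,
    media_stream_constraints={"audio": True, "video": False},
)

If someone can confirm whether this is even the intended pattern for audio-only capture, that would already help a lot.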

I’ve spent the past two weeks trying to debug this; it’s driving me nuts, so I have to ask for help.

Some of the code has parts that might not make total sense, as I’ve been commenting things in and out and moving chunks around, so feel free to point out anything that looks odd.

So, what would be the best way to chunk up the audio into 5-15 second segments and send each segment to my transcribe(buffer) method?
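
To make that concrete, this is roughly the shape of the helper I’m after — just an untested sketch, and wav_chunks_from_frames/chunk_seconds are names I’ve made up:

def wav_chunks_from_frames(frames, chunk_seconds=5.0):
    """Accumulate av.AudioFrame objects and yield in-memory WAV buffers
    of roughly chunk_seconds each. Untested sketch."""
    audio_seg = AudioSegment.silent(duration=0)
    for frame in frames:
        # Same frame-to-AudioSegment conversion as above.
        audio_seg += AudioSegment(
            data=frame.to_ndarray().tobytes(),
            sample_width=frame.format.bytes,
            frame_rate=frame.sample_rate,
            channels=len(frame.layout.channels),
        )
        if audio_seg.duration_seconds >= chunk_seconds:
            buffer = BytesIO()
            buffer.name = "audio.wav"
            audio_seg.export(buffer, format="wav")
            buffer.seek(0)
            yield buffer
            audio_seg = AudioSegment.silent(duration=0)

The bit I can’t figure out is the right way to feed it: whether to drive it from audio_receiver.get_frames() in a loop like my transmit() above, or from a processor callback.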

Thanks.