This example demonstrates how to use Agno agents to generate streaming audio responses using OpenAI’s GPT-4o audio preview model.
```python
import base64
import os
import wave
from typing import Iterator

from agno.agent import Agent, RunOutputEvent
from agno.models.openai import OpenAIChat

# Audio configuration
SAMPLE_RATE = 24000  # Hz (24 kHz)
CHANNELS = 1  # Mono (change to 2 for stereo)
SAMPLE_WIDTH = 2  # Bytes (16 bits)

# Configure the agent to return the result as text + audio
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={
            "voice": "alloy",
            "format": "pcm16",  # Only pcm16 is supported with streaming
        },
    ),
)

output_stream: Iterator[RunOutputEvent] = agent.run(
    "Tell me a 10 second story", stream=True
)

# Ensure the output directory exists
os.makedirs("tmp", exist_ok=True)
filename = "tmp/response_stream.wav"

# Open the file once in write-binary mode
with wave.open(str(filename), "wb") as wav_file:
    wav_file.setnchannels(CHANNELS)
    wav_file.setsampwidth(SAMPLE_WIDTH)
    wav_file.setframerate(SAMPLE_RATE)

    # Iterate over the streamed events
    for response in output_stream:
        response_audio = response.response_audio  # type: ignore
        if response_audio:
            # Print the transcript as it arrives
            if response_audio.transcript:
                print(response_audio.transcript, end="", flush=True)
            # Decode each base64-encoded PCM16 chunk and append it to the WAV file
            if response_audio.content:
                try:
                    pcm_bytes = base64.b64decode(response_audio.content)
                    wav_file.writeframes(pcm_bytes)
                except Exception as e:
                    print(f"Error decoding audio: {e}")
print()
```
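After the run completes, you can optionally check that the file was written with the expected parameters using the standard library's `wave` module. This is a sanity check added for illustration, not part of the example itself; the path matches the `filename` used above.

```python
import wave

# Read back the WAV header and report the stream parameters
with wave.open("tmp/response_stream.wav", "rb") as wav_file:
    frames = wav_file.getnframes()
    rate = wav_file.getframerate()
    print(f"Channels:     {wav_file.getnchannels()}")
    print(f"Sample width: {wav_file.getsampwidth()} bytes")
    print(f"Sample rate:  {rate} Hz")
    print(f"Duration:     {frames / rate:.2f} s")  # frames / sample rate
```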
The example writes the stream to a WAV file with a 24 kHz sample rate, a single mono channel, and a 16-bit sample width. Streaming lets playback begin as soon as the first chunks arrive, while the uncompressed PCM16 format preserves audio quality.
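If you want to hear the audio as it streams rather than (or in addition to) writing it to disk, a sketch along the following lines should work. It assumes the third-party `pyaudio` package is installed (not part of the example above); the playback parameters mirror the 24 kHz, mono, 16-bit configuration used earlier.

```python
import base64

import pyaudio

from agno.agent import Agent
from agno.models.openai import OpenAIChat

agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "pcm16"},
    ),
)

# Open an output stream matching the PCM16, 24 kHz, mono configuration
pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

try:
    for event in agent.run("Tell me a 10 second story", stream=True):
        response_audio = getattr(event, "response_audio", None)
        if response_audio and response_audio.content:
            # Decode each base64-encoded PCM16 chunk and play it immediately
            stream.write(base64.b64decode(response_audio.content))
finally:
    stream.stop_stream()
    stream.close()
    pa.terminate()
```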