Code

import base64
import wave
from agno.agent import Agent
from agno.models.openai import OpenAIChat
from typing import Iterator

# Audio Configuration
SAMPLE_RATE = 24000  # Hz (24kHz)
CHANNELS = 1  # Mono
SAMPLE_WIDTH = 2  # Bytes (16 bits)

agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={
            "voice": "alloy",
            "format": "pcm16",  # Required for streaming
        },
    ),
    debug_mode=True,
    add_history_to_messages=True,
)

# Question with streaming
output_stream: Iterator[RunResponse] = agent.run(
    "Is a golden retriever a good family dog?", 
    stream=True
)

with wave.open("tmp/answer_1.wav", "wb") as wav_file:
    wav_file.setnchannels(CHANNELS)
    wav_file.setsampwidth(SAMPLE_WIDTH)
    wav_file.setframerate(SAMPLE_RATE)
    
    for response in output_stream:
        if response.response_audio:
            if response.response_audio.transcript:
                print(response.response_audio.transcript, end="", flush=True)
            if response.response_audio.content:
                try:
                    pcm_bytes = base64.b64decode(response.response_audio.content)
                    wav_file.writeframes(pcm_bytes)
                except Exception as e:
                    print(f"Error decoding audio: {e}")
print()

Usage

1

Create a virtual environment

Open the Terminal and create a python virtual environment.

python3 -m venv .venv
source .venv/bin/activate
2

Set your API key

export OPENAI_API_KEY=xxx
3

Install libraries

pip install -U openai agno
4

Run Agent

python cookbook/agent_concepts/multimodal/audio_streaming.py