import base64
import wave
from pathlib import Path
from typing import Iterator

from agno.agent import Agent, RunResponse
from agno.models.openai import OpenAIChat
# Audio configuration written into the WAV header below.
# NOTE(review): assumed to match the pcm16 stream returned by the model --
# confirm against the provider's audio documentation.
SAMPLE_RATE = 24000  # Hz (24kHz)
CHANNELS = 1  # Mono
SAMPLE_WIDTH = 2  # Bytes (16 bits)

# Destination file for the spoken answer.
OUTPUT_PATH = Path("tmp/answer_1.wav")

# Agent configured to return both text and audio.
agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={
            "voice": "alloy",
            "format": "pcm16",  # Required for streaming
        },
    ),
    debug_mode=True,
    add_history_to_messages=True,
)

# Question with streaming
output_stream: Iterator[RunResponse] = agent.run(
    "Is a golden retriever a good family dog?",
    stream=True,
)

# wave.open() does not create missing parent directories, so make sure the
# output folder exists before opening the file for writing.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# wave.open() is given a plain string path here rather than the Path object.
with wave.open(str(OUTPUT_PATH), "wb") as wav_file:
    wav_file.setnchannels(CHANNELS)
    wav_file.setsampwidth(SAMPLE_WIDTH)
    wav_file.setframerate(SAMPLE_RATE)

    for response in output_stream:
        audio = response.response_audio
        if not audio:
            continue

        # Echo the transcript incrementally as chunks arrive.
        if audio.transcript:
            print(audio.transcript, end="", flush=True)

        if audio.content:
            # Only the base64 decode is guarded: a malformed chunk is
            # reported and skipped, while a failure to write to disk is
            # allowed to surface instead of being mislabelled as a
            # decoding error.
            try:
                pcm_bytes = base64.b64decode(audio.content)
            except (ValueError, TypeError) as e:
                print(f"Error decoding audio: {e}")
                continue
            wav_file.writeframes(pcm_bytes)

# Terminate the transcript line that was printed without newlines.
print()
Create a virtual environment
Open the Terminal and create a Python virtual environment.
python3 -m venv .venv
source .venv/bin/activate
Set your API key
export OPENAI_API_KEY=xxx
Install libraries
pip install -U openai agno
Run Agent
python cookbook/agent_concepts/multimodal/audio_streaming.py
Was this page helpful?