Agents can process images, audio, video, and files as input, and generate images and audio as output. This section introduces multimodal I/O; check out the full guide for more details.
Pass images via URL, file path, or raw bytes:
```python
from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIResponses

agent = Agent(model=OpenAIResponses(id="gpt-5.2"))

# From URL
agent.run(
    "What's in this image?",
    images=[Image(url="https://example.com/photo.jpg")]
)

# From file
agent.run(
    "Describe this image",
    images=[Image(filepath="./photo.jpg")]
)

# Multiple images
agent.run(
    "Compare these two images",
    images=[
        Image(url="https://example.com/photo1.jpg"),
        Image(url="https://example.com/photo2.jpg")
    ]
)
```
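Images can also be passed as raw bytes via the `content` parameter (see the media class table at the end of this section), which is useful when the image is already in memory:

```python
# From bytes
with open("./photo.jpg", "rb") as f:
    image_bytes = f.read()

agent.run(
    "What's in this image?",
    images=[Image(content=image_bytes)]
)
```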
Pass audio files for transcription or analysis:
```python
from agno.agent import Agent
from agno.media import Audio
from agno.models.openai import OpenAIChat

agent = Agent(
    model=OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"])
)

# From file
agent.run(
    "What is being said in this audio?",
    audio=[Audio(filepath="./recording.wav")]
)

# From bytes
with open("recording.wav", "rb") as f:
    audio_bytes = f.read()

agent.run(
    "Transcribe this audio",
    audio=[Audio(content=audio_bytes, format="wav")]
)
```
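`Audio` additionally accepts a `url` parameter (per the media class table at the end of this section), so a remotely hosted recording can be referenced directly; a minimal sketch with a placeholder URL:

```python
# From URL
agent.run(
    "Summarize this recording",
    audio=[Audio(url="https://example.com/recording.wav")]
)
```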
Pass video files for analysis:
```python
from agno.agent import Agent
from agno.media import Video
from agno.models.google import Gemini

agent = Agent(model=Gemini(id="gemini-2.0-flash-exp"))

agent.run(
    "Describe what happens in this video",
    videos=[Video(filepath="./clip.mp4")]
)
```
Video input is currently supported by Gemini models only.
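Per the media class table at the end of this section, `Video` also takes `url` and `content` (bytes); for example, with a placeholder URL:

```python
# From URL
agent.run(
    "Summarize this clip",
    videos=[Video(url="https://example.com/clip.mp4")]
)
```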
Pass documents like PDFs:
```python
from agno.agent import Agent
from agno.media import File
from agno.models.anthropic import Claude

agent = Agent(model=Claude(id="claude-sonnet-4-5"))

# From URL
agent.run(
    "Summarize this document",
    files=[File(url="https://example.com/report.pdf")]
)

# From file path
agent.run(
    "What are the key points in this PDF?",
    files=[File(filepath="./report.pdf")]
)
```
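`File` also accepts raw bytes via `content` (see the media class table below), which is handy when the document is already in memory, for example from an upload handler:

```python
# From bytes
with open("./report.pdf", "rb") as f:
    pdf_bytes = f.read()

agent.run(
    "Summarize this document",
    files=[File(content=pdf_bytes)]
)
```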
Image Generation
Generate images using tools like DALL-E:
```python
from agno.agent import Agent
from agno.models.openai import OpenAIResponses
from agno.tools.dalle import DalleTools

agent = Agent(
    model=OpenAIResponses(id="gpt-5.2"),
    tools=[DalleTools()],
)

agent.run("Generate an image of a sunset over mountains")

# Get generated images
images = agent.get_images()
for image in images:
    print(image.url)
```
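Since the generated images come back as hosted URLs, persisting one locally is an ordinary HTTP download. A minimal sketch continuing from the block above, using httpx (any HTTP client works; the output filenames are arbitrary):

```python
import httpx

# Download each generated image URL to a local file
for i, image in enumerate(images):
    data = httpx.get(image.url).content
    with open(f"generated_{i}.png", "wb") as f:
        f.write(data)
```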
Audio Generation
Generate audio responses:
```python
from agno.agent import Agent
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
    ),
)

response = agent.run("Tell me a short story")

# Save audio to file
if response.response_audio:
    write_audio_to_file(
        audio=response.response_audio.content,
        filename="story.wav"
    )
```
Process audio input and generate audio output:
```python
from agno.agent import Agent
from agno.media import Audio
from agno.models.openai import OpenAIChat
from agno.utils.audio import write_audio_to_file

agent = Agent(
    model=OpenAIChat(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
    ),
)

response = agent.run(
    "Respond to this message",
    audio=[Audio(filepath="./question.wav")]
)

if response.response_audio:
    write_audio_to_file(
        audio=response.response_audio.content,
        filename="response.wav"
    )
```
| Class | Parameters |
|---|---|
| `Image` | `url`, `filepath`, `content` (bytes) |
| `Audio` | `url`, `filepath`, `content` (bytes), `format` |
| `Video` | `url`, `filepath`, `content` (bytes) |
| `File` | `url`, `filepath`, `content` (bytes) |
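The input parameters can also be combined in a single run; this is a hedged sketch, assuming the chosen model supports all the modalities involved (the URLs are placeholders):

```python
from agno.agent import Agent
from agno.media import File, Image
from agno.models.anthropic import Claude

agent = Agent(model=Claude(id="claude-sonnet-4-5"))

# Image and document input in the same request
agent.run(
    "Does the chart in this image match the figures in the report?",
    images=[Image(url="https://example.com/chart.png")],
    files=[File(url="https://example.com/report.pdf")],
)
```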
Learn More
For detailed guides and examples, see the Multimodal documentation.