Skip to main content
Agents can process images, audio, video, and files as input, and generate images and audio as output. This section introduces Multimodal I/O; check out the full guide for more details.

Image Input

Pass images via URL, file path, or base64 content:
"""Example: passing images to an agent via URL, file path, or in batches."""
from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIResponses

agent = Agent(model=OpenAIResponses(id="gpt-5.2"))

# Single image referenced by URL.
agent.run(
    "What's in this image?",
    images=[Image(url="https://example.com/photo.jpg")],
)

# Single image loaded from the local filesystem.
agent.run(
    "Describe this image",
    images=[Image(filepath="./photo.jpg")],
)

# Several images attached to one request.
photos = [
    Image(url="https://example.com/photo1.jpg"),
    Image(url="https://example.com/photo2.jpg"),
]
agent.run("Compare these two images", images=photos)

Audio Input

Pass audio files for transcription or analysis:
"""Example: sending audio to an agent from a file path or from raw bytes."""
from agno.agent import Agent
from agno.media import Audio
from agno.models.openai import OpenAIResponses

agent = Agent(
    model=OpenAIResponses(id="gpt-4o-audio-preview", modalities=["text"])
)

# Audio referenced by file path.
agent.run(
    "What is being said in this audio?",
    audio=[Audio(filepath="./recording.wav")],
)

# Audio supplied as raw bytes; the format must be stated explicitly.
with open("recording.wav", "rb") as source:
    raw = source.read()

agent.run(
    "Transcribe this audio",
    audio=[Audio(content=raw, format="wav")],
)

Video Input

Pass video files for analysis:
# Example: passing a local video file to an agent for analysis.
# NOTE(review): per the surrounding docs, video input works with Gemini models only.
from agno.agent import Agent
from agno.media import Video
from agno.models.google import Gemini

agent = Agent(model=Gemini(id="gemini-2.0-flash-exp"))

# Attach the video from the local filesystem alongside the text prompt.
agent.run(
    "Describe what happens in this video",
    videos=[Video(filepath="./clip.mp4")]
)
Video input is currently supported by Gemini models only.

File Input

Pass documents like PDFs:
"""Example: passing documents (such as PDFs) to an agent."""
from agno.agent import Agent
from agno.media import File
from agno.models.anthropic import Claude

agent = Agent(model=Claude(id="claude-sonnet-4-5"))

# Document fetched from a URL.
agent.run(
    "Summarize this document",
    files=[File(url="https://example.com/report.pdf")],
)

# Document read from the local filesystem.
agent.run(
    "What are the key points in this PDF?",
    files=[File(filepath="./report.pdf")],
)

Image Generation

Generate images using tools like DALL-E:
"""Example: generating images through the DALL-E toolkit."""
from agno.agent import Agent
from agno.models.openai import OpenAIResponses
from agno.tools.dalle import DalleTools

agent = Agent(
    model=OpenAIResponses(id="gpt-5.2"),
    tools=[DalleTools()],
)

agent.run("Generate an image of a sunset over mountains")

# Print the URL of every image the run produced.
for generated in agent.get_images():
    print(generated.url)

Audio Generation

Generate audio responses:
"""Example: producing a spoken (audio) response and saving it to disk."""
from agno.agent import Agent
from agno.models.openai import OpenAIResponses
from agno.utils.audio import write_audio_to_file

agent = Agent(
    model=OpenAIResponses(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
    ),
)

result = agent.run("Tell me a short story")

# Persist the generated audio, if the model returned any.
if result.response_audio:
    write_audio_to_file(audio=result.response_audio.content, filename="story.wav")

Combined Input and Output

Process audio input and generate audio output:
"""Example: audio in, audio out — answer a spoken question with speech."""
from agno.agent import Agent
from agno.media import Audio
from agno.models.openai import OpenAIResponses
from agno.utils.audio import write_audio_to_file

agent = Agent(
    model=OpenAIResponses(
        id="gpt-4o-audio-preview",
        modalities=["text", "audio"],
        audio={"voice": "alloy", "format": "wav"},
    ),
)

result = agent.run(
    "Respond to this message",
    audio=[Audio(filepath="./question.wav")],
)

# Save the spoken reply, if one was produced.
if result.response_audio:
    write_audio_to_file(audio=result.response_audio.content, filename="response.wav")

Media Classes

Class | Parameters
----- | ----------
Image | url, filepath, content (bytes)
Audio | url, filepath, content (bytes), format
Video | url, filepath, content (bytes)
File  | url, filepath, content (bytes)

Learn More

For detailed guides and examples, see the Multimodal documentation: