Select OpenAI models accept images and audio as input, in addition to text.

1

Set up your virtual environment

python3 -m venv .venv
source .venv/bin/activate
2

Install dependencies

pip install -U openai duckduckgo-search agno
3

Export your OpenAI API key

export OPENAI_API_KEY=***
4

Run the agent

python agent.py

Example: Audio Agent

from agno.agent import Agent
from agno.media import Audio
from agno.models.openai import OpenAIChat

# Remote WAV sample that is handed to the agent as audio input.
SAMPLE_AUDIO_URL = (
    "https://openaiassets.blob.core.windows.net/$web/API/docs/audio/alloy.wav"
)

# The audio-preview model is configured with only the "text" modality
# (presumably so replies come back as text -- confirm against the model docs).
audio_model = OpenAIChat(id="gpt-4o-audio-preview", modalities=["text"])
agent = Agent(model=audio_model, markdown=True)

# Wrap the URL in an Audio object, declaring its format, and ask about it.
audio_clip = Audio(url=SAMPLE_AUDIO_URL, format="wav")
agent.print_response("What is in this audio?", audio=[audio_clip])

Example: Image Agent

from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIChat
from agno.tools.duckduckgo import DuckDuckGoTools

# Agent backed by gpt-4o and given the DuckDuckGo toolkit, which the prompt
# below asks it to use for a web search.
vision_model = OpenAIChat(id="gpt-4o")
search_tools = [DuckDuckGoTools()]
agent = Agent(model=vision_model, tools=search_tools, markdown=True)

# Image input is supplied by URL rather than as local file contents.
bridge_photo = Image(
    url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg"
)

# stream=True is passed through to print_response along with the image list.
agent.print_response(
    "Tell me about this image and search the web for more information.",
    images=[bridge_photo],
    stream=True,
)

View More Examples