This example demonstrates how a team can collaborate to analyze images and create structured movie scripts using Pydantic models for consistent output format.

Code

cookbook/examples/teams/multimodal/image_to_structured_output.py
from typing import List

from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIChat
from agno.team import Team
from pydantic import BaseModel, Field
from rich.pretty import pprint


class MovieScript(BaseModel):
    name: str = Field(..., description="Give a name to this movie")
    setting: str = Field(
        ..., description="Provide a nice setting for a blockbuster movie."
    )
    characters: List[str] = Field(..., description="Name of characters for this movie.")
    storyline: str = Field(
        ..., description="3 sentence storyline for the movie. Make it exciting!"
    )


image_analyst = Agent(
    name="Image Analyst",
    role="Analyze visual content and extract key elements",
    model=OpenAIChat(id="gpt-5-mini"),
    instructions=[
        "Analyze images for visual elements, setting, and characters",
        "Focus on details that can inspire creative content",
    ],
)

script_writer = Agent(
    name="Script Writer",
    role="Create structured movie scripts from visual inspiration",
    model=OpenAIChat(id="gpt-5-mini"),
    instructions=[
        "Transform visual analysis into compelling movie concepts",
        "Follow the structured output format precisely",
    ],
)

# Create a team for collaborative structured output generation
movie_team = Team(
    name="Movie Script Team",
    members=[image_analyst, script_writer],
    model=OpenAIChat(id="gpt-5-mini"),
    instructions=[
        "Create structured movie scripts from visual content.",
        "Image Analyst: First analyze the image for visual elements and context.",
        "Script Writer: Transform analysis into structured movie concepts.",
        "Ensure all output follows the MovieScript schema precisely.",
    ],
    output_schema=MovieScript,
)

response = movie_team.run(
    "Write a movie about this image",
    images=[
        Image(
            url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg"
        )
    ],
    stream=True,
)

for event in response:
    pprint(event.content)

Usage

1

Create a virtual environment

Open the Terminal and create a python virtual environment.
python3 -m venv .venv
source .venv/bin/activate
2

Install required libraries

pip install agno pydantic rich
3

Set environment variables

export OPENAI_API_KEY=****
4

Run the agent

python cookbook/examples/teams/multimodal/image_to_structured_output.py