Not all models support multimodal inputs and outputs.
To see which models support multimodal inputs and outputs, please check out the compatibility matrix.
Multimodal inputs to a team
Teams can process multimodal inputs by coordinating multiple specialized agents. Each agent can focus on specific aspects of the input, enabling sophisticated multimodal workflows.
Image Analysis Team
Let’s create a team that analyzes images and creates structured output:
image_to_structured_output.py
Copy
Ask AI
from typing import List
from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIChat
from agno.team import Team
from pydantic import BaseModel, Field
# Schema for the team's structured answer. Each Field description doubles as
# the per-field instruction text.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic would likely surface a docstring in the generated schema.
class MovieScript(BaseModel):
    name: str = Field(..., description="Give a name to this movie")
    setting: str = Field(
        ..., description="Provide a nice setting for a blockbuster movie."
    )
    characters: List[str] = Field(..., description="Name of characters for this movie.")
    storyline: str = Field(
        ..., description="3 sentence storyline for the movie. Make it exciting!"
    )
# Member agent whose role is to analyze the visual content of the input.
image_analyst = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Image Analyst",
    role="Analyze visual content and extract key elements",
    instructions=[
        "Analyze images for visual elements, setting, and characters",
        "Focus on details that can inspire creative content",
    ],
)

# Member agent whose role is to turn that analysis into a movie script.
script_writer = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Script Writer",
    role="Create structured movie scripts from visual inspiration",
    instructions=[
        "Transform visual analysis into compelling movie concepts",
        "Follow the structured output format precisely",
    ],
)

# The team combines both members and is configured with MovieScript as its
# output schema; the instructions spell out the order the members work in.
movie_team = Team(
    model=OpenAIChat(id="gpt-4o"),
    name="Movie Script Team",
    members=[image_analyst, script_writer],
    output_schema=MovieScript,
    instructions=[
        "Create structured movie scripts from visual content.",
        "Image Analyst: First analyze the image for visual elements and context.",
        "Script Writer: Transform analysis into structured movie concepts.",
        "Ensure all output follows the MovieScript schema precisely.",
    ],
)
# Run the team on a single image given by URL.
# With stream=True the call yields a stream of events rather than one finished
# result, so the stream is iterated here (as the image-generation example on
# this page does) -- otherwise nothing would ever be printed.
response = movie_team.run(
    "Write a movie about this image",
    images=[
        Image(
            url="https://upload.wikimedia.org/wikipedia/commons/0/0c/GoldenGateBridge-001.jpg"
        )
    ],
    stream=True,
)
for chunk in response:
    print(chunk)
Audio Processing Team
Teams can collaborate to transcribe and analyze audio content:
audio_to_text.py
Copy
Ask AI
import requests
from agno.agent import Agent
from agno.media import Audio
from agno.models.google import Gemini
from agno.team import Team
# Member agent responsible for turning the audio into a speaker-labelled transcript.
transcription_specialist = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    name="Transcription Specialist",
    role="Convert audio to accurate text transcriptions",
    instructions=[
        "Transcribe audio with high accuracy",
        "Identify speakers clearly as Speaker A, Speaker B, etc.",
        "Maintain conversation flow and context",
    ],
)

# Member agent responsible for analyzing the transcript.
content_analyzer = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    name="Content Analyzer",
    role="Analyze transcribed content for insights",
    instructions=[
        "Analyze transcription for key themes and insights",
        "Provide summaries and extract important information",
    ],
)

# Team that pairs the two members; the instructions state who goes first.
audio_team = Team(
    name="Audio Analysis Team",
    members=[transcription_specialist, content_analyzer],
    model=Gemini(id="gemini-2.0-flash-exp"),
    markdown=True,
    instructions=[
        "Work together to transcribe and analyze audio content.",
        "Transcription Specialist: First convert audio to accurate text with speaker identification.",
        "Content Analyzer: Analyze transcription for insights and key themes.",
    ],
)
# Download the sample recording and hand its raw bytes to the team.
url = "https://agno-public.s3.us-east-1.amazonaws.com/demo_data/QA-01.mp3"
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being passed on as audio.
response = requests.get(url, timeout=30)
response.raise_for_status()
audio_content = response.content

audio_team.print_response(
    "Give a transcript of this audio conversation. Use speaker A, speaker B to identify speakers.",
    audio=[Audio(content=audio_content)],
    stream=True,
)
Audio Sentiment Analysis Team
Teams can combine transcription and sentiment analysis for deeper insights:
audio_sentiment_analysis.py
Copy
Ask AI
import requests
from agno.agent import Agent
from agno.db.sqlite import SqliteDb
from agno.media import Audio
from agno.models.google import Gemini
from agno.team import Team
# Member agent that produces the speaker-labelled transcript.
transcription_agent = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    name="Audio Transcriber",
    role="Transcribe audio conversations accurately",
    instructions=[
        "Transcribe audio with speaker identification",
        "Maintain conversation structure and flow",
    ],
)

# Member agent that evaluates the emotional tone of the conversation.
sentiment_analyst = Agent(
    model=Gemini(id="gemini-2.0-flash-exp"),
    name="Sentiment Analyst",
    role="Analyze emotional tone and sentiment in conversations",
    instructions=[
        "Analyze sentiment for each speaker separately",
        "Identify emotional patterns and conversation dynamics",
        "Provide detailed sentiment insights",
    ],
)

# Team with a SQLite-backed session store and add_history_to_context enabled,
# which is what lets a later prompt refer back to an earlier one.
sentiment_team = Team(
    model=Gemini(id="gemini-2.0-flash-exp"),
    name="Audio Sentiment Team",
    members=[transcription_agent, sentiment_analyst],
    db=SqliteDb(
        db_file="tmp/audio_sentiment_team.db",
        session_table="audio_sentiment_team_sessions",
    ),
    add_history_to_context=True,
    markdown=True,
    instructions=[
        "Analyze audio sentiment with conversation memory.",
        "Audio Transcriber: First transcribe audio with speaker identification.",
        "Sentiment Analyst: Analyze emotional tone and conversation dynamics.",
    ],
)
# Download the sample conversation and hand its raw bytes to the team.
url = "https://agno-public.s3.amazonaws.com/demo_data/sample_conversation.wav"
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being passed on as audio.
response = requests.get(url, timeout=30)
response.raise_for_status()
audio_content = response.content

# First turn: the audio is attached to the request.
sentiment_team.print_response(
    "Give a sentiment analysis of this audio conversation. Use speaker A, speaker B to identify speakers.",
    audio=[Audio(content=audio_content)],
    stream=True,
)

# Follow-up turn: no audio is attached, so the answer has to come from the
# conversation history the team keeps.
sentiment_team.print_response(
    "What else can you tell me about this audio conversation?",
    stream=True,
)
Video Processing Team
Currently Agno only supports video as an input for Gemini models.
video_caption_generation.py
Copy
Ask AI
from agno.agent import Agent
from agno.models.openai import OpenAIChat
from agno.team import Team
from agno.tools.moviepy_video import MoviePyVideoTools
from agno.tools.openai import OpenAITools
# Member agent equipped with the MoviePy toolkit, constructed with
# process_video and generate_captions switched on.
video_processor = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Video Processor",
    role="Handle video processing and audio extraction",
    tools=[MoviePyVideoTools(process_video=True, generate_captions=True)],
    instructions=[
        "Extract audio from videos for processing",
        "Handle video file operations efficiently",
    ],
)

# Member agent equipped with the MoviePy toolkit (embed_captions switched on)
# plus the OpenAI toolkit.
caption_generator = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Caption Generator",
    role="Generate and embed captions in videos",
    tools=[MoviePyVideoTools(embed_captions=True), OpenAITools()],
    instructions=[
        "Transcribe audio to create accurate captions",
        "Generate SRT format captions with proper timing",
        "Embed captions seamlessly into videos",
    ],
)

# Team whose instructions lay out the captioning pipeline step by step.
caption_team = Team(
    model=OpenAIChat(id="gpt-4o"),
    name="Video Caption Team",
    description="Team that generates and embeds captions for videos",
    members=[video_processor, caption_generator],
    markdown=True,
    instructions=[
        "Process videos to generate captions in this sequence:",
        "1. Extract audio from the video using extract_audio",
        "2. Transcribe the audio using transcribe_audio",
        "3. Generate SRT captions using create_srt",
        "4. Embed captions into the video using embed_captions",
    ],
)

# NOTE(review): the prompt does not name a video file or path -- confirm how
# the team is expected to locate the video to process.
caption_team.print_response("Generate captions for the video and embed them")
Multimodal outputs from a team
Teams can coordinate to generate multimodal outputs through specialized agents with different capabilities.
Image Generation Team
The following example demonstrates how to generate images using a team with prompt engineering and DALL-E:
generate_image_with_team.py
Copy
Ask AI
from typing import Iterator
from agno.agent import Agent, RunOutputEvent
from agno.models.openai import OpenAIChat
from agno.team import Team
from agno.tools.dalle import DalleTools
# Member agent that owns the DALL-E toolkit and produces the image.
image_generator = Agent(
    name="Image Creator",
    role="Generate images using DALL-E",
    model=OpenAIChat(id="gpt-4o"),
    tools=[DalleTools()],
    instructions=[
        "Use the DALL-E tool to create high-quality images",
        # The markdown image template had been stripped from this string,
        # leaving empty backticks; it is restored so the instruction is complete.
        "Return image URLs in markdown format: `![image description](image URL)`",
    ],
)

# Member agent that rewrites the user's prompt before generation.
prompt_engineer = Agent(
    name="Prompt Engineer",
    role="Optimize and enhance image generation prompts",
    model=OpenAIChat(id="gpt-4o"),
    instructions=[
        "Enhance user prompts for better image generation results",
        "Consider artistic style, composition, and technical details",
    ],
)

# Create a team for collaborative image generation
image_team = Team(
    name="Image Generation Team",
    model=OpenAIChat(id="gpt-4o"),
    members=[prompt_engineer, image_generator],
    instructions=[
        "Generate high-quality images from user prompts.",
        "Prompt Engineer: First enhance and optimize the user's prompt.",
        "Image Creator: Generate images using the enhanced prompt with DALL-E.",
    ],
    markdown=True,
)
# Run the team in streaming mode with stream_events enabled; the call is
# annotated as returning an iterator of run events.
run_stream: Iterator[RunOutputEvent] = image_team.run(
    "Create an image of a yellow siamese cat",
    stream=True,
    stream_events=True,
)

# Print each event as it arrives. The loop body must be indented -- the
# flattened form is a syntax error.
for chunk in run_stream:
    print(chunk)
Multimodal inputs and outputs together
Teams excel at combining multimodal inputs and outputs, enabling complex transformations and creative workflows.
Image to Image Transformation Team
image_to_image_transformation.py
Copy
Ask AI
from agno.agent import Agent
from agno.models.openai import OpenAIChat
from agno.team import Team
from agno.tools.fal import FalTools
# Member agent that recommends how the image should be restyled.
style_advisor = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Style Advisor",
    role="Analyze and recommend artistic styles and transformations",
    instructions=[
        "Analyze the input image and transformation request",
        "Provide style recommendations and enhancement suggestions",
        "Consider artistic elements like composition, lighting, and mood",
    ],
)

# Member agent that holds the Fal toolkit and performs the transformation.
image_transformer = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Image Transformer",
    role="Transform images using AI tools",
    tools=[FalTools()],
    instructions=[
        "Use the `image_to_image` tool to generate transformed images",
        "Apply the recommended styles and transformations",
        "Return the image URL as provided without markdown conversion",
    ],
)

# Team that runs the advisor first and the transformer second, per its instructions.
transformation_team = Team(
    name="Image Transformation Team",
    members=[style_advisor, image_transformer],
    model=OpenAIChat(id="gpt-4o"),
    markdown=True,
    instructions=[
        "Transform images with artistic style and precision.",
        "Style Advisor: First analyze transformation requirements and recommend styles.",
        "Image Transformer: Apply transformations using AI tools with style guidance.",
    ],
)

# The reference image is supplied as a URL inside the prompt text itself.
transformation_team.print_response(
    "a cat dressed as a wizard with a background of a mystic forest. Make it look like 'https://fal.media/files/koala/Chls9L2ZnvuipUTEwlnJC.png'",
    stream=True,
)
Image to Text Story Team
image_to_text.py
Copy
Ask AI
from pathlib import Path
from agno.agent import Agent
from agno.media import Image
from agno.models.openai import OpenAIChat
from agno.team import Team
# Member agent that describes what is in the image.
image_analyzer = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Image Analyst",
    role="Analyze and describe images in detail",
    instructions=[
        "Analyze images carefully and provide detailed descriptions",
        "Focus on visual elements, composition, and key details",
    ],
)

# Member agent that turns the description into fiction.
creative_writer = Agent(
    model=OpenAIChat(id="gpt-4o"),
    name="Creative Writer",
    role="Create engaging stories and narratives",
    instructions=[
        "Transform image descriptions into compelling fiction stories",
        "Use vivid language and creative storytelling techniques",
    ],
)

# Team that chains the analyst and the writer, per its instructions.
image_team = Team(
    name="Image Story Team",
    members=[image_analyzer, creative_writer],
    model=OpenAIChat(id="gpt-4o"),
    markdown=True,
    instructions=[
        "Work together to create compelling fiction stories from images.",
        "Image Analyst: First analyze the image for visual details and context.",
        "Creative Writer: Transform the analysis into engaging fiction narratives.",
        "Ensure the story captures the essence and mood of the image.",
    ],
)

# The image is read from sample.jpg in the same directory as this script.
image_path = Path(__file__).parent.joinpath("sample.jpg")

image_team.print_response(
    "Write a 3 sentence fiction story about the image",
    images=[Image(filepath=image_path)],
)
Team Tools with Media Access
Team tools can access media (images, videos, audio, files) passed to the team, enabling direct processing without delegation:
media_input_for_tool.py
Copy
Ask AI
from typing import Optional, Sequence
from agno.agent import Agent
from agno.media import File
from agno.models.google import Gemini
from agno.team import Team
from agno.tools import Toolkit
# Toolkit exposing a single tool, extract_text_from_pdf, which receives the
# files passed to the team through its `files` parameter.
class DocumentProcessingTools(Toolkit):
    def __init__(self):
        # Register the bound method as this toolkit's only tool.
        tools = [
            self.extract_text_from_pdf,
        ]
        super().__init__(name="document_processing_tools", tools=tools)

    def extract_text_from_pdf(self, files: Optional[Sequence[File]] = None) -> str:
        """
        Extract text from uploaded PDF files using OCR.

        Args:
            files: Files passed to the team (automatically injected)

        Returns:
            Extracted text from the PDF files
        """
        # Covers both "no argument supplied" (None) and an empty sequence.
        if not files:
            return "No files were uploaded to process."

        extracted_texts = []
        # Numbering follows the position in `files`, so a skipped file still
        # consumes its number.
        for position, file in enumerate(files, start=1):
            # Files without content are skipped rather than reported.
            if file.content:
                # Process file content (e.g., OCR, parsing)
                # Placeholder: only the byte length is reported here.
                file_size = len(file.content)
                extracted_texts.append(f"Processed file {position}: {file_size} bytes")
        return "\n".join(extracted_texts)
# A single general-purpose member; the document tool itself is attached to
# the team, not to this agent.
member_agent = Agent(
    name="Assistant",
    description="A general assistant agent.",
    model=Gemini(id="gemini-2.5-pro"),
)

# Team that carries the document-processing toolkit directly.
# NOTE(review): send_media_to_model=False presumably keeps the raw file out of
# the model request so that only the tool sees it -- confirm against the Team docs.
team = Team(
    name="Document Processing Team",
    model=Gemini(id="gemini-2.5-pro"),
    members=[member_agent],
    tools=[DocumentProcessingTools()],
    send_media_to_model=False,
    store_media=True,
    instructions=[
        "When files are uploaded, use the extract_text_from_pdf tool to process them.",
        "Analyze the extracted content and provide insights directly in your response.",
    ],
)

# Stand-in bytes for a real PDF, wrapped as a File and passed with the run.
pdf_content = b"Sample PDF content"
sample_file = File(content=pdf_content)

response = team.run(
    input="Extract text from the uploaded PDF and analyze it.",
    files=[sample_file],
)
Developer Resources
- View Image Team Examples
- View Audio Team Examples
- View Video Team Examples