Code

multimodal_workflow.py
from agno.agent import Agent
from agno.models.openai import OpenAIChat
from agno.os.app import AgentOS
from agno.os.interfaces.slack import Slack
from agno.tools.dalle import DalleTools
from agno.tools.websearch import WebSearchTools
from agno.workflow import Parallel, Step, Workflow

# Agent 1: examines attached images/files (or reasons about the topic
# visually when nothing is attached) and returns a concise breakdown.
analyst = Agent(
    name="Visual Analyst",
    model=OpenAIChat(id="gpt-4o"),
    markdown=True,
    instructions=[
        "Analyze any images or files provided.",
        "Describe visual elements, composition, colors, mood.",
        "If no image, analyze the text topic visually.",
        "Keep analysis concise but detailed.",
    ],
)

# Agent 2: gathers supporting facts and context from the web.
researcher = Agent(
    name="Web Researcher",
    model=OpenAIChat(id="gpt-4o"),
    markdown=True,
    tools=[WebSearchTools()],
    instructions=[
        "Search the web for information related to the user's request.",
        "Provide relevant facts, trends, and context.",
        "Format results with markdown.",
    ],
)

# Agent 3: merges the upstream analysis/research and can generate an
# image with DALL-E when the user asked for one.
synthesizer = Agent(
    name="Creative Synthesizer",
    model=OpenAIChat(id="gpt-4o"),
    markdown=True,
    tools=[DalleTools()],
    instructions=[
        "Combine the analysis and research from previous steps.",
        "If the user asked for an image, generate one with DALL-E.",
        "Provide a final comprehensive response.",
        "Format with markdown.",
    ],
)

# Workflow step wrapping the visual-analysis agent.
analysis_step = Step(
    agent=analyst,
    name="Visual Analysis",
    description="Analyze input images/files or describe the topic visually",
)

# Workflow step wrapping the web-research agent.
research_step = Step(
    agent=researcher,
    name="Web Research",
    description="Search the web for related context and information",
)

# Analysis and research have no dependency on each other, so they run
# concurrently inside a single Parallel phase.
_concurrent_steps = (analysis_step, research_step)
research_phase = Parallel(*_concurrent_steps, name="Research Phase")

# Final step: the synthesizer folds both parallel outputs into one answer.
synthesis_step = Step(
    agent=synthesizer,
    name="Creative Synthesis",
    description="Combine analysis + research into a final response, generate images if requested",
)

# Two-stage pipeline: the parallel research phase feeds the synthesis step.
_pipeline_steps = [research_phase, synthesis_step]
creative_workflow = Workflow(name="Creative Pipeline", steps=_pipeline_steps)

# Shortcut buttons shown in the Slack assistant drawer.
_SUGGESTED_PROMPTS = [
    {
        "title": "Analyze",
        "message": "Send me an image to analyze and research",
    },
    {
        "title": "Create",
        "message": "Research cyberpunk art trends and generate an image",
    },
    {
        "title": "Compare",
        "message": "Compare impressionism and expressionism art styles",
    },
]

# Slack front-end for the workflow: streams replies and only reacts when
# the bot is explicitly @-mentioned.
_slack = Slack(
    workflow=creative_workflow,
    streaming=True,
    reply_to_mentions_only=True,
    suggested_prompts=_SUGGESTED_PROMPTS,
)

agent_os = AgentOS(
    workflows=[creative_workflow],
    interfaces=[_slack],
)

# ASGI application object; referenced below as "multimodal_workflow:app".
app = agent_os.get_app()


if __name__ == "__main__":
    # Start the AgentOS server. The "module:attr" import string must match
    # this file's name (multimodal_workflow.py) so the server can re-import
    # the app; reload=True presumably enables dev auto-reload — confirm
    # against the agno serve() docs before relying on it in production.
    agent_os.serve(app="multimodal_workflow:app", reload=True)

Usage

1. Set up your virtual environment

uv venv --python 3.12
source .venv/bin/activate
2. Set environment variables

export SLACK_TOKEN=xoxb-your-bot-user-token
export SLACK_SIGNING_SECRET=your-signing-secret
export OPENAI_API_KEY=your-openai-api-key
3. Install dependencies

uv pip install -U "agno[slack]" openai
4. Run the example

python multimodal_workflow.py

Key Features

  • Parallel Execution: Visual analysis and web research run simultaneously in the Research Phase, reducing total response time
  • Three-Stage Pipeline: Analyze, research, then synthesize results into a final response with optional DALL-E image generation
  • Workflow-Driven Architecture: Uses Workflow, Step, and Parallel primitives to define a structured multi-agent pipeline
  • Suggested Prompts: Pre-configured prompt buttons in the Slack assistant drawer for common use cases