WhatsApp agent that analyzes images, videos, and audio using multimodal AI
"""Expose a Gemini-backed media agent over WhatsApp through AgentOS.

The module builds one agent, registers it with an AgentOS instance together
with a WhatsApp interface bound to that same agent, and publishes the
resulting application object as ``app``.
"""

from agno.agent import Agent
from agno.db.sqlite import SqliteDb
from agno.models.google import Gemini
from agno.os.app import AgentOS
from agno.os.interfaces.whatsapp import Whatsapp

# SQLite file handed to the agent as its database.
agent_db = SqliteDb(db_file="tmp/persistent_memory.db")

media_agent = Agent(
    # Identity and model.
    name="Media Agent",
    model=Gemini(id="gemini-2.0-flash"),
    # Storage plus the history settings that rely on it.
    db=agent_db,
    add_history_to_context=True,
    num_history_runs=3,
    # Extra context and output formatting.
    add_datetime_to_context=True,
    markdown=True,
)

# The same agent instance is registered with AgentOS and bound to the
# WhatsApp interface.
agent_os = AgentOS(
    agents=[media_agent],
    interfaces=[Whatsapp(agent=media_agent)],
)

# Module-level application object; the import string passed to serve()
# below refers to it by this name.
app = agent_os.get_app()


if __name__ == "__main__":
    # "agent_with_media:app" is a "<module>:<attribute>" import string, so it
    # must match this file's module name and the `app` variable above.
    agent_os.serve(app="agent_with_media:app", reload=True)
Create a virtual environment
Terminal
python3 -m venv .venv
source .venv/bin/activate
Set Environment Variables
export WHATSAPP_ACCESS_TOKEN=your_whatsapp_access_token
export WHATSAPP_PHONE_NUMBER_ID=your_phone_number_id
export WHATSAPP_WEBHOOK_URL=your_webhook_url
export WHATSAPP_VERIFY_TOKEN=your_verify_token
export GOOGLE_API_KEY=your_google_api_key
export APP_ENV=development
Install libraries
pip install -U agno
Run Example
python cookbook/os/interfaces/whatsapp/agent_with_media.py