import subprocess
import time
from pathlib import Path
from agno.agent import Agent
from agno.media import Video
from agno.models.google import Gemini
from agno.utils.log import logger
from google.generativeai import get_file, upload_file
# Source video: a file named sample.mp4 located in the same directory as this script.
video_path = Path(__file__).parent / "sample.mp4"
# Directory (relative to the current working directory) that receives the generated clips.
output_dir = Path("tmp/shorts")
# Agent that inspects the uploaded video and reports candidate segments as a
# markdown table (Start Time | End Time | Description | Importance Score).
# extract_segments() later parses exactly that table layout, so the column
# order requested in the instructions must stay in sync with its regex.
agent = Agent(
    name="Video2Shorts",
    description="Process videos and generate engaging shorts.",
    model=Gemini(id="gemini-2.0-flash-exp"),
    instructions=[
        # Keep the model focused on the uploaded file only.
        "Analyze the provided video directly—do NOT reference or analyze any external sources or YouTube videos.",
        "Identify engaging moments that meet the specified criteria for short-form content.",
        # Output contract consumed by the table parser.
        """Provide your analysis in a **table format** with these columns:
- Start Time | End Time | Description | Importance Score""",
        "Ensure all timestamps use MM:SS format and importance scores range from 1-10. ",
        "Focus only on segments between 15 and 60 seconds long.",
        "Base your analysis solely on the provided video content.",
        "Deliver actionable insights to improve the identified segments for short-form optimization.",
    ],
    markdown=True,
    debug_mode=True,
)
# Upload the video and poll every 2 seconds until the service has finished
# processing it; the file handle is refreshed on each iteration.
video_file = upload_file(video_path)
while video_file.state.name == "PROCESSING":
    time.sleep(2)
    video_file = get_file(video_file.name)
# The loop above also exits when processing failed. Stop here instead of
# sending an unusable file to the model.
if video_file.state.name == "FAILED":
    raise ValueError(
        f"Video processing failed for {video_path} (state: {video_file.state.name})"
    )
# Multimodal query for video analysis.
# This prompt is sent together with the uploaded video. It asks for segment
# timestamps in MM:SS format and a 1-10 importance score, which is the data
# extract_segments() later pulls out of the response table.
query = """
You are an expert in video content creation, specializing in crafting engaging short-form content for platforms like YouTube Shorts and Instagram Reels. Your task is to analyze the provided video and identify segments that maximize viewer engagement.
For each video, you'll:
1. Identify key moments that will capture viewers' attention, focusing on:
- High-energy sequences
- Emotional peaks
- Surprising or unexpected moments
- Strong visual and audio elements
- Clear narrative segments with compelling storytelling
2. Extract segments that work best for short-form content, considering:
- Optimal length (strictly 15–60 seconds)
- Natural start and end points that ensure smooth transitions
- Engaging pacing that maintains viewer attention
- Audio-visual harmony for an immersive experience
- Vertical format compatibility and adjustments if necessary
3. Provide a detailed analysis of each segment, including:
- Precise timestamps (Start Time | End Time in MM:SS format)
- A clear description of why the segment would be engaging
- Suggestions on how to enhance the segment for short-form content
- An importance score (1-10) based on engagement potential
Your goal is to identify moments that are visually compelling, emotionally engaging, and perfectly optimized for short-form platforms.
"""
# Generate the video analysis: run the agent on the prompt with the uploaded
# file attached as a video input.
response = agent.run(query, videos=[Video(content=video_file)])
# Create the output directory (including missing parents); an already
# existing directory is not an error.
# NOTE: output_dir is already assigned a Path earlier in this file, so the
# re-wrap below does not change its value.
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
# Extract and cut video segments
def _timestamp_to_seconds(timestamp):
    """Convert an ``MM:SS`` timestamp string into a whole number of seconds."""
    minutes, seconds = timestamp.split(":")
    return 60 * int(minutes) + int(seconds)


def extract_segments(
    response_text,
    *,
    min_duration=15,
    max_duration=60,
    score_threshold=7,
    video_filter="scale=1080:1920,setsar=1:1",
):
    """Parse the model's segment table and cut the qualifying clips with ffmpeg.

    Every table row shaped like ``| MM:SS | MM:SS | description | score |`` is
    considered. A row is cut into ``short_<n>.mp4`` inside the module-level
    ``output_dir`` when its duration lies within ``[min_duration, max_duration]``
    seconds and its score is strictly greater than ``score_threshold``. The
    clip is read from the module-level ``video_path``.

    Args:
        response_text: Model output containing the markdown table; it is
            converted with ``str()`` before parsing, so any object is accepted.
        min_duration: Shortest accepted segment length in seconds (inclusive).
        max_duration: Longest accepted segment length in seconds (inclusive).
        score_threshold: Rows must score strictly above this value.
        video_filter: Value passed to ffmpeg's ``-vf`` option.
            NOTE(review): the default forces 1080x1920 without preserving the
            source aspect ratio, so input that is not 9:16 gets stretched.
            Pass e.g. ``"scale=1080:1920:force_original_aspect_ratio=decrease,"
            "pad=1080:1920:(ow-iw)/2:(oh-ih)/2,setsar=1:1"`` to letterbox.

    Returns:
        A list of dicts with the keys ``path``, ``description`` and ``score``,
        one per clip that ffmpeg produced successfully, in table order.

    Raises:
        FileNotFoundError: Propagated from ``subprocess.run`` when the
            ``ffmpeg`` executable cannot be found. A non-zero ffmpeg exit
            status is reported on stdout and the segment is skipped.
    """
    import re

    row_pattern = re.compile(
        r"\|\s*(\d+:\d+)\s*\|\s*(\d+:\d+)\s*\|\s*(.*?)\s*\|\s*(\d+)\s*\|"
    )
    segments: list[dict] = []
    for row in row_pattern.finditer(str(response_text)):
        start_time, end_time, description, raw_score = row.groups()
        score = int(raw_score)
        start_seconds = _timestamp_to_seconds(start_time)
        duration = _timestamp_to_seconds(end_time) - start_seconds

        # Guard clauses: skip rows outside the length window or not scoring
        # strictly above the threshold.
        if not min_duration <= duration <= max_duration:
            continue
        if score <= score_threshold:
            continue

        # Make sure the destination exists even when the caller has not
        # created it; ffmpeg does not create missing directories itself.
        output_dir.mkdir(parents=True, exist_ok=True)
        # Numbering follows the count of successful clips, so a failed cut
        # does not leave a gap in the file names.
        output_path = output_dir / f"short_{len(segments) + 1}.mp4"
        command = [
            "ffmpeg",
            "-ss",  # seek before opening the input
            str(start_seconds),
            "-i",
            str(video_path),  # explicit str keeps the argv list homogeneous
            "-t",
            str(duration),
            "-vf",
            video_filter,
            "-c:v",
            "libx264",
            "-c:a",
            "aac",
            "-y",  # overwrite an existing clip of the same name
            str(output_path),
        ]
        try:
            subprocess.run(command, check=True)
        except subprocess.CalledProcessError:
            print(f"Failed to process segment: {start_time} - {end_time}")
        else:
            segments.append(
                {"path": output_path, "description": description, "score": score}
            )
    return segments
# Log the raw model output so the table that gets parsed can be inspected.
logger.debug(f"{response.content}")

# Parse the response table and cut the qualifying segments.
shorts = extract_segments(response.content)

# Report each generated clip, followed by a blank line.
print("\n--- Generated Shorts ---")
for short in shorts:
    print(
        f"Short at {short['path']}",
        f"Description: {short['description']}",
        f"Engagement Score: {short['score']}/10\n",
        sep="\n",
    )
Create a virtual environment
Open the Terminal and create a Python virtual environment.
python3 -m venv .venv
source .venv/bin/activate
Set your API key
export GOOGLE_API_KEY=xxx
Install libraries
pip install -U opencv-python google-generativeai sqlalchemy ffmpeg-python agno
Install ffmpeg
brew install ffmpeg
Run Agent
python cookbook/agent_concepts/multimodal/video_to_shorts.py
Was this page helpful?