This example demonstrates how tools can access and process files (PDFs, documents, etc.) passed to the agent. It shows uploading a PDF file, processing it with a custom tool, and having the LLM respond based on the extracted content.

Code

cookbook/agents/multimodal/file_input_for_tool.py
"""
Example showing how tools can access media (images, videos, audio, files) passed to the agent.

This demonstrates:
1. Uploading a PDF file to an agent
2. A tool that can access and process the uploaded file (OCR simulation)
3. The LLM responding based on the tool's processing result
"""

from typing import Optional, Sequence

from agno.agent import Agent
from agno.media import File
from agno.models.google import Gemini
from agno.tools import Toolkit


class DocumentProcessingTools(Toolkit):
    def __init__(self):
        tools = [
            self.extract_text_from_pdf,
        ]

        super().__init__(name="document_processing_tools", tools=tools)

    def extract_text_from_pdf(self, files: Optional[Sequence[File]] = None) -> str:
        """
        Extract text from uploaded PDF files using OCR.

        This tool can access any files that were passed to the agent.
        In a real implementation, you would use a proper OCR service.

        Args:
            files: Files passed to the agent (automatically injected)

        Returns:
            Extracted text from the PDF files
        """
        if not files:
            return "No files were uploaded to process."

        print(f"--> Files: {files}")

        extracted_texts = []
        for i, file in enumerate(files):
            if file.content:
                # Simulate OCR processing
                # In reality, you'd use a service like Tesseract, AWS Textract, etc.
                file_size = len(file.content)
                extracted_text = f"""
                    [SIMULATED OCR RESULT FOR FILE {i + 1}]
                    Document processed successfully!
                    File size: {file_size} bytes

                    Sample extracted content:
                    "This is a sample document with important information about quarterly sales figures.
                    Q1 Revenue: $125,000
                    Q2 Revenue: $150,000
                    Q3 Revenue: $175,000

                    The growth trend shows a 20% increase quarter over quarter."
                """
                extracted_texts.append(extracted_text)
            else:
                extracted_texts.append(
                    f"File {i + 1}: Content is empty or inaccessible."
                )

        return "\n\n".join(extracted_texts)


def create_sample_pdf_content() -> bytes:
    """Create a sample PDF-like content for demonstration."""
    # This is just sample binary content - in reality you'd have actual PDF bytes
    sample_content = """
    %PDF-1.4
    Sample PDF content for demonstration
    This would be actual PDF binary data in a real scenario
    """.encode("utf-8")
    return sample_content


def main():
    # Create an agent with document processing tools
    agent = Agent(
        model=Gemini(id="gemini-2.5-pro"),
        tools=[DocumentProcessingTools()],
        name="Document Processing Agent",
        description="An agent that can process uploaded documents. Use the tool to extract text from the PDF.",
        debug_mode=True,
        # Keep the uploaded file out of the model request; the tool processes it instead
        send_media_to_model=False,
        # Retain the uploaded media on the run so tools can access it
        store_media=True,
    )

    print("=== Tool Media Access Example ===\n")

    # Example 1: PDF Processing
    print("1. Testing PDF processing...")

    # Create sample file content
    pdf_content = create_sample_pdf_content()
    sample_file = File(content=pdf_content)

    response = agent.run(
        input="I've uploaded a PDF document. Please extract the text from it and summarize the key financial information.",
        files=[sample_file],
        session_id="test_files",
    )

    print(f"Agent Response: {response.content}")
    print("\n" + "=" * 50 + "\n")


if __name__ == "__main__":
    main()
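
To extract real text instead of the simulated OCR result, the tool body can be swapped for an actual PDF parser while keeping the same signature, so Agno still injects files into the tool call. A minimal sketch, assuming a local sample.pdf and the third-party pypdf package, neither of which is part of the original example:

from io import BytesIO
from pathlib import Path
from typing import Optional, Sequence

from pypdf import PdfReader  # assumed extra dependency: pip install pypdf

from agno.media import File


def extract_text_from_pdf(files: Optional[Sequence[File]] = None) -> str:
    """Extract text from uploaded PDFs with pypdf instead of simulated OCR."""
    if not files:
        return "No files were uploaded to process."

    extracted_texts = []
    for i, file in enumerate(files):
        if not file.content:
            extracted_texts.append(f"File {i + 1}: Content is empty or inaccessible.")
            continue
        reader = PdfReader(BytesIO(file.content))
        # Join the text of every page; extract_text() can return None for image-only pages
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
        extracted_texts.append(f"[FILE {i + 1}]\n{text}")

    return "\n\n".join(extracted_texts)


# Pass a real PDF from disk instead of the simulated bytes:
real_file = File(content=Path("sample.pdf").read_bytes())

Because the signature is unchanged, this version can replace the method inside DocumentProcessingTools without touching the rest of the script.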

Usage

1. Create a virtual environment

Open the Terminal and create a Python virtual environment.

python3 -m venv .venv
source .venv/bin/activate

2. Set your API key

export GOOGLE_API_KEY=xxx
# or, if you switch the agent's model to OpenAI (see the note after these steps)
# export OPENAI_API_KEY=xxx

3. Install libraries

pip install -U agno google-generativeai
# or for OpenAI
# pip install -U agno openai

4. Run Agent

python cookbook/agents/multimodal/file_input_for_tool.py
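
The steps above mention OpenAI as an alternative, but the script itself hard-codes Gemini. To run the same agent against OpenAI, swap the model in file_input_for_tool.py. A minimal sketch, assuming the rest of the script is unchanged; "gpt-4o" is only an illustrative model id:

from agno.models.openai import OpenAIChat

agent = Agent(
    model=OpenAIChat(id="gpt-4o"),
    tools=[DocumentProcessingTools()],
    name="Document Processing Agent",
    description="An agent that can process uploaded documents. Use the tool to extract text from the PDF.",
    debug_mode=True,
    send_media_to_model=False,
    store_media=True,
)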