llama.cpp runs quantized GGUF models efficiently on CPUs, making it ideal for resource-constrained environments.
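Before running the examples below, install Agno (`pip install agno`) and start a local llama.cpp server; depending on your setup, something like `llama-server -hf ggml-org/gpt-oss-20b-GGUF` will pull the model from Hugging Face and serve it on `localhost:8080`.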
Basic Usage

```python
from agno.agent import Agent
from agno.models.llama_cpp import LlamaCpp

agent = Agent(
    model=LlamaCpp(id="ggml-org/gpt-oss-20b-GGUF"),
    markdown=True,
)

agent.print_response("Explain neural networks", stream=True)
```

Tool Use

```python
from agno.agent import Agent
from agno.models.llama_cpp import LlamaCpp
from agno.tools.websearch import WebSearchTools

agent = Agent(
    model=LlamaCpp(id="ggml-org/gpt-oss-20b-GGUF"),
    tools=[WebSearchTools()],
    markdown=True,
)

agent.print_response("What's happening in France?", stream=True)
```
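Beyond the built-in toolkits, agents can typically call your own functions as tools. A minimal sketch, assuming Agno accepts plain typed Python functions in `tools` (the `get_temperature` function is hypothetical and returns stubbed data):

```python
from agno.agent import Agent
from agno.models.llama_cpp import LlamaCpp

def get_temperature(city: str) -> str:
    """Return the current temperature for a city (stubbed for illustration)."""
    return f"The temperature in {city} is 21°C."

agent = Agent(
    model=LlamaCpp(id="ggml-org/gpt-oss-20b-GGUF"),
    tools=[get_temperature],  # assumption: plain functions are valid tools
    markdown=True,
)

agent.print_response("How warm is it in Paris right now?", stream=True)
```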

Structured Output

```python
from pydantic import BaseModel, Field
from agno.agent import Agent
from agno.models.llama_cpp import LlamaCpp

class Summary(BaseModel):
    title: str = Field(..., description="Title")
    points: list[str] = Field(..., description="Key points")

agent = Agent(
    model=LlamaCpp(id="ggml-org/gpt-oss-20b-GGUF"),
    output_schema=Summary,
)

agent.print_response("Summarize the benefits of open source software")
```
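To work with the parsed object instead of printing it, you can run the agent directly. A minimal sketch continuing from the agent above, assuming `agent.run()` returns a response whose `content` is the parsed `Summary` instance when `output_schema` is set:

```python
# Assumption: with output_schema set, response.content is a Summary instance.
response = agent.run("Summarize the benefits of open source software")
summary = response.content
print(summary.title)
for point in summary.points:
    print(f"- {point}")
```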

Run Examples

```bash
git clone https://github.com/agno-agi/agno.git
cd agno/cookbook/92_models/llama_cpp

python basic.py
python tool_use.py
```