vLLM provides high-throughput, self-hosted inference for large language models behind an OpenAI-compatible API, making it a good fit for production deployments. The examples below assume a vLLM server is already running locally (see Run Examples).
from agno.agent import Agent
from agno.models.vllm import VLLM

agent = Agent(
    model=VLLM(id="Qwen/Qwen2.5-7B-Instruct", top_k=20, enable_thinking=False),
    markdown=True,
)

agent.print_response("Explain distributed computing", stream=True)

Tool Use

from agno.agent import Agent
from agno.models.vllm import VLLM
from agno.tools.websearch import WebSearchTools

agent = Agent(
    model=VLLM(id="NousResearch/Nous-Hermes-2-Mistral-7B-DPO", top_k=20, enable_thinking=False),
    tools=[WebSearchTools()],
    markdown=True,
)

agent.print_response("What's the latest news in AI?", stream=True)

Structured Output

from pydantic import BaseModel, Field
from agno.agent import Agent
from agno.models.vllm import VLLM

class Summary(BaseModel):
    title: str = Field(..., description="Document title")
    key_points: list[str] = Field(..., description="Key points")
    conclusion: str = Field(..., description="Conclusion")

agent = Agent(
    model=VLLM(id="Qwen/Qwen2.5-7B-Instruct", top_k=20, enable_thinking=False),
    output_schema=Summary,
)

agent.print_response("Summarize the benefits of containerization")

Run Examples

# Start the vLLM server (used by the basic and structured-output examples)
vllm serve Qwen/Qwen2.5-7B-Instruct

# tool_use.py needs its own model served with tool calling enabled
# (--tool-call-parser must match the model's function-calling format)
vllm serve NousResearch/Nous-Hermes-2-Mistral-7B-DPO \
    --enable-auto-tool-choice --tool-call-parser hermes

git clone https://github.com/agno-agi/agno.git
cd agno/cookbook/92_models/vllm

python basic.py
python tool_use.py