>
┌──────────────────┐
│ Nginx (HTTPS) │
│ 限流 / 认证 │
└────────┬─────────┘
│
┌──────────────────┼──────────────────┐
▼ ▼ ▼
┌────────────┐ ┌────────────┐ ┌────────────┐
│ FastAPI P1 │ │ FastAPI P2 │ │ FastAPI P3 │
│ (uvicorn) │ │ (uvicorn) │ │ (uvicorn) │
└──────┬──────┘ └──────┬──────┘ └──────┬──────┘
│ │ │
└──────────────────┼──────────────────┘
│
┌──────────────────▼──────────────────┐
│ 模型路由层 (Router) │
│ 负载均衡 / 熔断 / 模型选择 │
└──────────────────┬──────────────────┘
│
┌────────────────────────────┼────────────────────────────┐
▼ ▼ ▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Qwen-72B │ │ Llama3-70B │ │ Claude-3.5 │
│ (Ollama) │ │ (vLLM) │ │ (API) │
└─────────────┘ └─────────────┘ └─────────────┘
# OS: Ubuntu 22.04 LTS (recommended)
# Hardware: NVIDIA A100 80GB x 1 (minimum configuration)

# 1. Update the system and install base dependencies
sudo apt update && sudo apt upgrade -y
sudo apt install -y \
    build-essential \
    git \
    curl \
    wget \
    vim \
    htop \
    net-tools \
    ca-certificates \
    gnupg \
    lsb-release \
    software-properties-common   # provides add-apt-repository (used in step 5)

# 2. Install the NVIDIA driver only when it is missing.
#    If nvidia-smi already prints GPU information this step is skipped,
#    so the machine is not rebooted needlessly.
if ! nvidia-smi; then
    sudo apt install -y nvidia-driver-535   # 535 is a long-term support branch
    sudo systemctl reboot                   # takes effect after reboot; resume from here afterwards
fi

# Verify the driver installation
nvidia-smi
# Expected output looks like:
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 535.54.03    Driver Version: 535.54.03    CUDA Version: 12.2     |
# |-------------------------------+----------------------+----------------------+
# | GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
# |   0  NVIDIA A100-80GB    Off  | 00000000:00:1E.0 Off |                    0 |
# +-------------------------------+----------------------+----------------------+

# 3. Install CUDA Toolkit 12.1 (needed by vLLM).
#    The cuda-keyring package registers both the signing key and the apt
#    repository, so no separate add-apt-repository call is needed.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt update
sudo apt install -y cuda-toolkit-12-1

# 4. Install cuDNN 8.9
# NOTE(review): confirm the exact package version string with
# `apt-cache policy libcudnn8` before pinning.
sudo apt install -y libcudnn8=8.9.7.0-1+cuda12.1 \
    libcudnn8-dev=8.9.7.0-1+cuda12.1

# 5. Install Python 3.11 (recommended)
sudo add-apt-repository -y ppa:deadsnakes/ppa
sudo apt update
sudo apt install -y python3.11 python3.11-venv python3.11-dev

# Verify
python3.11 --version   # Python 3.11.9

# 6. Do NOT repoint /usr/bin/python3 or /usr/bin/python at python3.11:
#    Ubuntu's own tooling (apt, add-apt-repository, ...) depends on the
#    distribution interpreter. The virtual environment below provides
#    `python` and `python3` that resolve to 3.11 while it is active.

# Create the project directory and the virtual environment (recommended)
sudo mkdir -p /opt/llm-serve
sudo chown "$USER" /opt/llm-serve
python3.11 -m venv /opt/llm-serve/venv
source /opt/llm-serve/venv/bin/activate
python --version

# Install PyTorch 2.2 (CUDA 12.1 build)
pip install torch==2.2.0 torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu121

# Install vLLM (high-performance inference engine)
# NOTE(review): vllm pins its own torch version; pip may replace the torch
# installed above. Confirm the final version with the check at the end.
pip install vllm==0.4.0

# Install FastAPI and related dependencies.
# `asyncio` is part of the standard library and must not be installed from
# PyPI (the PyPI package is an obsolete backport). `redis` is listed because
# the rate limiter imports redis.asyncio and requirements.txt pins it.
pip install \
    fastapi==0.111.0 \
    "uvicorn[standard]==0.29.0" \
    httpx==0.27.0 \
    pydantic==2.7.0 \
    pydantic-settings==2.2.1 \
    python-multipart==0.0.9 \
    aiofiles==23.2.1 \
    redis==5.0.4 \
    prometheus-client==0.20.0 \
    opentelemetry-api==1.24.0 \
    opentelemetry-sdk==1.24.0

# API documentation needs no extra packages: the FastAPI app below serves
# interactive docs itself at /docs and /redoc.

# Verify the installation
python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
# Expected output: PyTorch: 2.2.0, CUDA: True
# (the reported torch version differs if the vLLM install re-pinned it)
# 项目目录结构
# /opt/llm-serve/
# ├── app/
# │ ├── __init__.py
# │ ├── main.py # FastAPI 入口
# │ ├── config.py # 配置管理
# │ ├── router.py # 模型路由
# │ ├── models/
# │ │ ├── __init__.py
# │ │ ├── base.py # 模型基类(BaseLLMModel)
# │ │ ├── ollama_model.py # Ollama 模型封装
# │ │ ├── vllm_model.py # vLLM 模型封装
# │ │ └── openai_model.py # OpenAI 兼容模型封装
# │ ├── middleware/
# │ │ ├── __init__.py
# │ │ ├── rate_limit.py # 限流中间件
# │ │ └── logging.py # 日志中间件
# │ └── utils/
# │ ├── __init__.py
# │ └── metrics.py # 监控指标
# ├── models/ # 本地模型文件
# ├── logs/ # 日志目录
# ├── requirements.txt
# ├── Dockerfile
# └── docker-compose.yaml
# app/config.py
from pydantic_settings import BaseSettings
from functools import lru_cache
from typing import Dict, List, Optional
from pydantic import BaseModel, ConfigDict


class ModelConfig(BaseModel):
    """Configuration of a single backend model.

    This is a plain ``BaseModel`` rather than ``BaseSettings``: it is only
    ever used as an element of ``Settings.models``. As a settings class,
    every instance would additionally read unprefixed environment variables
    (``TIMEOUT``, ``ENABLED``, ``PRIORITY``, ...) for any field that was not
    passed explicitly.
    """

    # "model_id" starts with pydantic's protected "model_" prefix; clearing the
    # protected namespaces avoids the resulting warning at class creation.
    model_config = ConfigDict(protected_namespaces=())

    name: str                        # identifier used by clients to select the model
    type: str                        # "ollama" | "vllm" | "openai"
    base_url: Optional[str] = None   # address of the Ollama / vLLM server
    api_key: Optional[str] = None    # API key (OpenAI-style backends)
    model_id: str                    # backend-side model name, e.g. "qwen2-72b"
    max_tokens: int = 8192           # maximum generation length
    temperature: float = 0.7         # default sampling temperature
    timeout: int = 120               # request timeout in seconds
    priority: int = 1                # routing priority (smaller number = higher priority)
    enabled: bool = True             # whether the model is offered at all
from pydantic_settings import SettingsConfigDict


class Settings(BaseSettings):
    """Global application settings.

    Values come from the defaults below and can be overridden through
    environment variables or the ``.env`` file named in ``model_config``.
    """

    # pydantic v2 configuration; replaces the deprecated inner ``class Config``.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
    )

    app_name: str = "LLM Serving API"
    version: str = "1.0.0"
    debug: bool = False

    # Server
    host: str = "0.0.0.0"
    port: int = 8000
    workers: int = 4                 # number of uvicorn workers

    # Rate limiting
    rate_limit_requests: int = 100   # requests allowed per window
    rate_limit_window: int = 60      # window length in seconds

    # Model list. These are sample defaults; adjust them to the deployment.
    models: List[ModelConfig] = [
        ModelConfig(
            name="qwen72b",
            type="ollama",
            base_url="http://localhost:11434",
            model_id="qwen2-72b",
            max_tokens=8192,
            priority=1,
        ),
        # NOTE(review): this base_url uses port 8000, the same port as the
        # `port` default above. If both run on one host they collide —
        # confirm the intended vLLM port.
        ModelConfig(
            name="llama70b",
            type="vllm",
            base_url="http://localhost:8000",
            model_id="meta-llama/Llama-3-70b-instruct",
            max_tokens=8192,
            priority=2,
        ),
        # NOTE(review): "sk-xxxx" is a placeholder; supply the real key via
        # the environment / .env instead of committing it here.
        ModelConfig(
            name="gpt4",
            type="openai",
            api_key="sk-xxxx",
            model_id="gpt-4-turbo",
            max_tokens=4096,
            priority=3,
        ),
    ]

    # Redis (cache and rate-limit counters)
    redis_url: str = "redis://localhost:6379/0"

    # Logging
    log_level: str = "INFO"
    log_file: str = "/opt/llm-serve/logs/app.log"
@lru_cache()
def get_settings() -> Settings:
    """Return the shared Settings object.

    The result is memoized by ``lru_cache``, so ``Settings()`` is constructed
    on the first call and the same instance is returned afterwards.
    """
    instance = Settings()
    return instance
# app/models/base.py
from abc import ABC, abstractmethod
from typing import AsyncIterator, Optional, Dict, Any
import httpx
import json
class BaseLLMModel(ABC):
"""
LLM 模型基类,定义统一接口
所有模型封装(Ollama/vLLM/OpenAI)都继承此基类
"""
def __init__(
self,
base_url: str,
model_id: str,
api_key: Optional[str] = None,
max_tokens: int = 8192,
temperature: float = 0.7,
timeout: int = 120
):
self.base_url = base_url.rstrip('/')
self.model_id = model_id
self.api_key = api_key
self.max_tokens = max_tokens
self.temperature = temperature
self.timeout = timeout
self._client: Optional[httpx.AsyncClient] = None
@abstractmethod
async def chat(
self,
messages: list,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
stream: bool = False
) -> Dict[str, Any] | AsyncIterator[str]:
"""
发送对话请求
Args:
messages: [{"role": "user", "content": "..."}, ...]
temperature: 温度参数(覆盖默认值)
max_tokens: 最大生成长度(覆盖默认值)
stream: 是否流式输出
Returns:
非流式:{"content": "生成的文本", "usage": {...}}
流式:AsyncIterator[str] 逐 tokenyeild
"""
pass
async def close(self):
"""关闭 HTTP 客户端"""
if self._client:
await self._client.aclose()
# app/models/ollama_model.py
from .base import BaseLLMModel
from typing import AsyncIterator, Dict, Any
import httpx
class OllamaModel(BaseLLMModel):
    """Wrapper around the Ollama HTTP API (default: http://localhost:11434)."""

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use."""
        if self._client is None:
            self._client = httpx.AsyncClient(timeout=self.timeout)
        return self._client

    async def chat(
        self,
        messages: list,
        temperature: float | None = None,
        max_tokens: int | None = None,
        stream: bool = False,
    ) -> Dict[str, Any] | AsyncIterator[str]:
        """Call Ollama's ``POST /api/chat``.

        Request body::

            {
                "model": "qwen2-72b",
                "messages": [{"role": "user", "content": "..."}],
                "stream": false,
                "options": {"temperature": 0.7, "num_predict": 8192}
            }

        Args:
            messages: Chat history as role/content dicts.
            temperature: Overrides the instance default when not ``None``.
            max_tokens: Overrides the instance default when not ``None``.
            stream: When true, return an async generator of text pieces.

        Returns:
            Non-streaming: dict with ``content``, ``usage`` and ``model``.
            Streaming: async iterator yielding each message content piece.

        Raises:
            httpx.HTTPStatusError: If the backend answers with an error
                status (for streaming, raised while iterating).
        """
        # Imported locally: this module's import block does not import json.
        import json

        client = await self._get_client()

        # Compare against None so an explicit 0 / 0.0 is honoured.
        if temperature is None:
            temperature = self.temperature
        if max_tokens is None:
            max_tokens = self.max_tokens

        url = f"{self.base_url}/api/chat"
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": stream,
            "options": {
                "temperature": temperature,
                "num_predict": max_tokens,
            },
        }

        if stream:
            async def stream_generator():
                async with client.stream("POST", url, json=payload) as response:
                    # Surface backend errors instead of parsing an error
                    # body as if it were a stream chunk.
                    response.raise_for_status()
                    async for line in response.aiter_lines():
                        if not line:
                            continue
                        data = json.loads(line)
                        if "message" in data:
                            yield data["message"]["content"]
                        if data.get("done", False):
                            break

            return stream_generator()

        response = await client.post(url, json=payload)
        response.raise_for_status()
        data = response.json()
        return {
            "content": data["message"]["content"],
            "usage": {
                "prompt_tokens": data.get("prompt_eval_count", 0),
                "completion_tokens": data.get("eval_count", 0),
            },
            "model": self.model_id,
        }
# app/models/vllm_model.py
from .base import BaseLLMModel
from typing import AsyncIterator, Dict, Any
import httpx
import json
class VLLMModel(BaseLLMModel):
    """Wrapper around vLLM's OpenAI-compatible API.

    Endpoint used: ``{base_url}/v1/chat/completions``.
    """

    async def _get_client(self) -> httpx.AsyncClient:
        """Return the shared HTTP client, creating it on first use.

        When an API key is configured it is sent as a Bearer token.
        """
        if self._client is None:
            headers = {}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"
            self._client = httpx.AsyncClient(
                timeout=self.timeout,
                headers=headers,
            )
        return self._client

    async def chat(
        self,
        messages: list,
        temperature: float | None = None,
        max_tokens: int | None = None,
        stream: bool = False,
    ) -> Dict[str, Any] | AsyncIterator[str]:
        """Call ``POST /v1/chat/completions``.

        Args:
            messages: Chat history as role/content dicts.
            temperature: Overrides the instance default when not ``None``.
            max_tokens: Overrides the instance default when not ``None``.
            stream: When true, return an async generator of text pieces.

        Returns:
            Non-streaming: dict with ``content``, ``usage`` and ``model``.
            Streaming: async iterator yielding each non-empty delta content.

        Raises:
            httpx.HTTPStatusError: If the backend answers with an error
                status (for streaming, raised while iterating).
        """
        client = await self._get_client()

        # Compare against None so an explicit 0 / 0.0 is honoured.
        if temperature is None:
            temperature = self.temperature
        if max_tokens is None:
            max_tokens = self.max_tokens

        url = f"{self.base_url}/v1/chat/completions"
        payload = {
            "model": self.model_id,
            "messages": messages,
            "stream": stream,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }

        if stream:
            async def stream_generator():
                async with client.stream("POST", url, json=payload) as response:
                    # Surface backend errors instead of silently yielding nothing.
                    response.raise_for_status()
                    async for line in response.aiter_lines():
                        if not line or not line.startswith("data: "):
                            continue
                        if line.strip() == "data: [DONE]":
                            break
                        data = json.loads(line[6:])
                        choices = data.get("choices") or []
                        if not choices:
                            continue
                        # Deltas may carry no content or a null content;
                        # only real text is forwarded.
                        piece = choices[0].get("delta", {}).get("content")
                        if piece:
                            yield piece

            return stream_generator()

        response = await client.post(url, json=payload)
        response.raise_for_status()
        data = response.json()
        return {
            "content": data["choices"][0]["message"]["content"],
            "usage": data.get("usage", {}),
            "model": data.get("model", self.model_id),
        }
# app/router.py
from fastapi import APIRouter, HTTPException, BackgroundTasks
from pydantic import BaseModel, Field
from typing import List, Optional, AsyncIterator, Literal
import asyncio
import httpx
import json
from app.config import get_settings, ModelConfig
router = APIRouter(prefix="/v1", tags=["llm"])
class Message(BaseModel):
    """A single chat turn: who is speaking and what was said."""

    # Only these three roles are accepted.
    role: Literal["system", "user", "assistant"]
    content: str
class ChatRequest(BaseModel):
    """Request body of the chat completion endpoint."""

    # Required; matched against the configured model names / model ids.
    model: str = Field(..., description="模型名称,如 qwen72b, llama70b")
    messages: List[Message]
    # Optional overrides, validated to 0.0 <= temperature <= 2.0 and
    # 0 < max_tokens <= 128000.
    temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
    max_tokens: Optional[int] = Field(default=None, gt=0, le=128000)
    stream: bool = Field(default=False)
    # Must be positive when given; defaults to 120.
    timeout: Optional[int] = Field(default=120, gt=0)
class ChatResponse(BaseModel):
    """Response body for a non-streaming chat completion."""

    model: str         # name of the model that answered
    content: str       # generated text
    usage: dict        # token usage as reported by the backend wrapper
    latency_ms: float  # handler latency in milliseconds
class StreamChunk(BaseModel):
    """A piece of streamed output together with the model name."""

    model: str
    content: str
# Cache of backend model wrappers, filled lazily and keyed by config name.
_model_instances = {}


def get_model_instance(config: "ModelConfig"):
    """Return the wrapper for ``config``, creating and caching it on first use.

    The wrapper class is chosen by ``config.type`` and imported lazily, so
    only the backends that are actually configured get imported.

    Args:
        config: Model configuration; ``name`` is the cache key.

    Returns:
        The cached or newly created model wrapper.

    Raises:
        ValueError: If ``config.type`` is not "ollama", "vllm" or "openai".
    """
    if config.name in _model_instances:
        return _model_instances[config.name]

    # Arguments shared by every backend wrapper.
    common = {
        "model_id": config.model_id,
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "timeout": config.timeout,
    }

    if config.type == "ollama":
        from app.models.ollama_model import OllamaModel
        instance = OllamaModel(base_url=config.base_url, **common)
    elif config.type == "vllm":
        from app.models.vllm_model import VLLMModel
        # The key is forwarded so a configured api_key is actually sent as
        # the Bearer token; None keeps the previous behaviour.
        instance = VLLMModel(
            base_url=config.base_url,
            api_key=config.api_key,
            **common,
        )
    elif config.type == "openai":
        from app.models.openai_model import OpenAIModel
        instance = OpenAIModel(
            base_url="https://api.openai.com/v1",
            api_key=config.api_key,
            **common,
        )
    else:
        # Previously an unknown type fell through to a bare KeyError on the
        # cache lookup.
        raise ValueError(
            f"Unsupported model type '{config.type}' for model '{config.name}'"
        )

    _model_instances[config.name] = instance
    return instance
def find_model_config(model_name: str) -> ModelConfig:
    """Resolve ``model_name`` to the configuration of an enabled model.

    An exact match on ``name`` is preferred; if none exists, a match on
    ``model_id`` is accepted. Disabled models are never returned.

    Raises:
        HTTPException: 404 when no enabled model matches.
    """
    settings = get_settings()
    enabled = [cfg for cfg in settings.models if cfg.enabled]

    # Two full passes in this order keep the "name beats model_id" rule.
    for attribute in ("name", "model_id"):
        hit = next(
            (cfg for cfg in enabled if getattr(cfg, attribute) == model_name),
            None,
        )
        if hit is not None:
            return hit

    raise HTTPException(
        status_code=404,
        detail=f"Model '{model_name}' not found or disabled. "
        f"Available: {[m.name for m in settings.models if m.enabled]}"
    )
@router.post("/chat/completions")
async def chat_completions(request: ChatRequest):
    """Unified chat completion endpoint.

    Looks up the requested model, forwards the conversation to its wrapper
    and returns either a ``ChatResponse`` or, when ``request.stream`` is
    set, a server-sent-event stream that ends with ``data: [DONE]``.

    Raises:
        HTTPException: 404 for an unknown/disabled model, 504 on backend
            timeout, 502 when the backend returns an error status, 500 for
            anything else.
    """
    import time

    from fastapi.responses import StreamingResponse

    # perf_counter is monotonic, so the latency cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()

    # 1. Resolve the model configuration and its wrapper.
    config = find_model_config(request.model)
    model = get_model_instance(config)

    # 2. Convert the validated messages to plain dicts.
    messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # NOTE(review): request.timeout is accepted by the schema but not applied
    # here; the wrapper's own timeout governs the call.
    try:
        # 3. Call the model.
        result = await model.chat(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream,
        )

        if request.stream:
            async def generate():
                try:
                    async for chunk in result:
                        chunk_data = {
                            "model": config.name,
                            "choices": [{
                                "delta": {"content": chunk},
                                "finish_reason": None,
                            }],
                        }
                        yield f"data: {json.dumps(chunk_data)}\n\n"
                    # End-of-stream marker.
                    yield "data: [DONE]\n\n"
                except Exception as e:
                    # The response has already started, so the error is
                    # reported as an event rather than an HTTP status.
                    error_data = {"error": {"message": str(e)}}
                    yield f"data: {json.dumps(error_data)}\n\n"

            return StreamingResponse(
                generate(),
                media_type="text/event-stream",
            )

        latency = (time.perf_counter() - started) * 1000
        return ChatResponse(
            model=config.name,
            content=result["content"],
            usage=result.get("usage", {}),
            latency_ms=round(latency, 2),
        )
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="Model request timeout") from e
    except httpx.HTTPStatusError as e:
        raise HTTPException(
            status_code=502,
            detail=f"Model API error: {e.response.text}",
        ) from e
    except Exception as e:
        # The message previously read "DM Sansnal error" (garbled text).
        raise HTTPException(status_code=500, detail=f"Internal error: {str(e)}") from e
@router.get("/models")
async def list_models():
    """Return the enabled models, ordered by ascending priority value."""
    settings = get_settings()
    ordered = sorted(settings.models, key=lambda cfg: cfg.priority)

    entries = []
    for cfg in ordered:
        if not cfg.enabled:
            continue
        entries.append({
            "name": cfg.name,
            "model_id": cfg.model_id,
            "type": cfg.type,
            "max_tokens": cfg.max_tokens,
            "enabled": cfg.enabled,
            "priority": cfg.priority,
        })
    return {"models": entries}
# app/middleware/rate_limit.py
from fastapi import Request, HTTPException
from starlette.middleware.base import BaseHTTPMiddleware
from collections import defaultdict
import time
import asyncio
import logging


class RateLimitMiddleware(BaseHTTPMiddleware):
    """Sliding-window rate limiter for the chat endpoints.

    Counters live in Redis when a ``redis_url`` is given and the client can
    be created; otherwise, or whenever a Redis call fails, an in-process
    dictionary is used as a fallback.
    """

    _log = logging.getLogger(__name__)

    def __init__(
        self,
        app,
        requests: int = 100,
        window: int = 60,
        redis_url: str = None,
    ):
        """
        Args:
            app: The wrapped ASGI application.
            requests: Maximum number of requests per window and key.
            window: Window length in seconds.
            redis_url: Optional Redis URL; without it only the in-process
                counter is used.
        """
        super().__init__(app)
        self.requests = requests
        self.window = window
        self.redis_url = redis_url
        # In-process timestamps per key (development / no Redis).
        self._local_counts = defaultdict(list)
        # Optional Redis client.
        self._redis = None
        if redis_url:
            try:
                import redis.asyncio as redis
                self._redis = redis.from_url(redis_url)
            except Exception as exc:
                # Still best effort, but no longer silent.
                self._log.warning(
                    "Redis rate limiting unavailable, using in-process counters: %s",
                    exc,
                )

    async def _check_redis(self, key: str) -> bool:
        """Return True if the request identified by ``key`` is allowed.

        Uses a sorted set per key inside a Lua script so the prune / count /
        add sequence is atomic. Falls back to the in-process check when no
        Redis client exists or the call fails.
        """
        if not self._redis:
            return await self._check_local(key)
        try:
            now = time.time()
            lua_script = """
            local key = KEYS[1]
            local window = tonumber(ARGV[1])
            local limit = tonumber(ARGV[2])
            local now = tonumber(ARGV[3])
            local window_start = now - window
            -- 删除窗口外的记录
            redis.call('ZREMRANGEBYSCORE', key, 0, window_start)
            -- 计算当前请求数
            local count = redis.call('ZCARD', key)
            if count < limit then
            -- 添加当前请求
            redis.call('ZADD', key, now, now .. ':' .. math.random())
            redis.call('EXPIRE', key, window)
            return 1
            else
            return 0
            end
            """
            result = await self._redis.eval(
                lua_script,
                1,
                f"ratelimit:{key}",
                self.window,
                self.requests,
                now,
            )
            return bool(result)
        except Exception as exc:
            self._log.warning(
                "Redis rate-limit check failed, using in-process counters: %s",
                exc,
            )
            return await self._check_local(key)

    async def _check_local(self, key: str) -> bool:
        """In-process sliding-window check (fallback path)."""
        now = time.time()
        window_start = now - self.window
        # Drop timestamps that fell out of the window.
        self._local_counts[key] = [
            t for t in self._local_counts[key] if t > window_start
        ]
        if len(self._local_counts[key]) < self.requests:
            self._local_counts[key].append(now)
            return True
        return False

    async def dispatch(self, request: Request, call_next):
        """Reject over-limit requests to paths starting with ``/v1/chat``.

        The 429 is returned as a response object: an ``HTTPException``
        raised inside a ``BaseHTTPMiddleware`` is not converted by the
        application's exception handlers and would surface as a 500.
        """
        if request.url.path.startswith("/v1/chat"):
            # request.client can be None (e.g. some test clients / transports).
            # NOTE(review): behind a reverse proxy this is the proxy address
            # unless forwarded headers are trusted — confirm deployment flags.
            client_ip = request.client.host if request.client else "unknown"
            key = f"{client_ip}:{request.url.path}"
            allowed = await self._check_redis(key)
            if not allowed:
                from starlette.responses import JSONResponse

                return JSONResponse(
                    status_code=429,
                    content={
                        "detail": f"Rate limit exceeded. "
                        f"Max {self.requests} requests per {self.window}s. "
                        f"Retry after {self.window}s."
                    },
                    headers={"Retry-After": str(self.window)},
                )
        return await call_next(request)
# app/main.py
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import logging
import time
from contextlib import asynccontextmanager
from app.config import get_settings
from app.router import router as llm_router
from app.middleware.rate_limit import RateLimitMiddleware
from app.utils.metrics import PrometheusMetrics
# Configure logging: INFO level, timestamp / logger name / level / message.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Module-wide metrics object, read by the /metrics endpoint.
metrics = PrometheusMetrics()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: warm up model wrappers, close them on shutdown.

    Start-up creates a wrapper for every enabled model; a failure is logged
    and does not prevent the application from starting. Shutdown closes
    every cached wrapper; one failing ``close()`` does not stop the rest.
    """
    from app.router import _model_instances, get_model_instance

    settings = get_settings()
    logger.info(f"Starting {settings.app_name} v{settings.version}")
    logger.info(f"Models: {[m.name for m in settings.models if m.enabled]}")

    # Warm-up: instantiate the wrappers of all enabled models.
    for model_config in settings.models:
        if not model_config.enabled:
            continue
        try:
            get_model_instance(model_config)
            logger.info(f"Initialized model: {model_config.name}")
        except Exception as e:
            logger.warning(f"Failed to init {model_config.name}: {e}")

    try:
        yield  # application is running
    finally:
        # Iterate over a snapshot so the cache can be cleared afterwards.
        for name, model in list(_model_instances.items()):
            try:
                await model.close()
            except Exception as e:
                logger.warning(f"Failed to close {name}: {e}")
        _model_instances.clear()
        logger.info("All models closed. Shutdown complete.")
# Create the FastAPI application. Title and version come from the settings so
# they cannot drift from the values reported by /health and the start-up log.
settings = get_settings()
app = FastAPI(
    title=settings.app_name,
    description="Multi-model LLM serving with FastAPI",
    version=settings.version,
    docs_url="/docs",
    redoc_url="/redoc",
    lifespan=lifespan,
)

# CORS middleware.
# A wildcard origin must not be combined with allow_credentials=True: that
# would let any site send credentialed requests. Credentials stay off until
# allow_origins is restricted to known origins.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # restrict in production
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Rate-limit middleware
app.add_middleware(
    RateLimitMiddleware,
    requests=settings.rate_limit_requests,
    window=settings.rate_limit_window,
    redis_url=settings.redis_url,
)

# Register routes
app.include_router(llm_router)
# Health check
@app.get("/health")
async def health_check():
    """Report liveness together with the configured application version."""
    current_version = get_settings().version
    return {"status": "ok", "version": current_version}
# Metrics endpoint
@app.get("/metrics")
async def metrics_endpoint():
    """Return whatever the module-wide metrics object reports via get_all()."""
    snapshot = metrics.get_all()
    return snapshot
# Global exception handler
@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception):
    """Log an unhandled exception and answer with a JSON 500 response."""
    # Pass the exception explicitly so the traceback is logged regardless of
    # the ambient exception state at the time the handler runs.
    logger.error(f"Unhandled exception: {exc}", exc_info=exc)
    # NOTE(review): "detail" exposes the raw exception text to the client;
    # consider returning it only in debug mode.
    return JSONResponse(
        status_code=500,
        # The message previously read "DM Sansnal server error" (garbled text).
        content={"error": "Internal server error", "detail": str(exc)},
    )
if __name__ == "__main__":
    import uvicorn

    # Script entry point: all server options are taken from the settings.
    settings = get_settings()
    run_options = {
        "host": settings.host,
        "port": settings.port,
        "workers": settings.workers,
        "log_level": settings.log_level.lower(),
        "reload": settings.debug,
    }
    uvicorn.run("app.main:app", **run_options)
# Dockerfile
FROM python:3.11-slim

# System dependencies (curl is used by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Working directory
WORKDIR /app

# Copy the dependency list first so the install layer is cached
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY app/ ./app/

# Create the log directory inside the image instead of copying it from the
# build context: COPY fails when logs/ does not exist there, and host log
# files do not belong in the image.
RUN mkdir -p /app/logs

# Create a non-root user that owns the application directory
RUN useradd -m -u 1000 llmuser && \
    chown -R llmuser:llmuser /app

# Drop privileges
USER llmuser

# Port the application listens on
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Start command
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
# requirements.txt
fastapi==0.111.0
uvicorn[standard]==0.29.0
httpx==0.27.0
pydantic==2.7.0
pydantic-settings==2.2.1
python-multipart==0.0.9
redis==5.0.4
prometheus-client==0.20.0
opentelemetry-api==1.24.0
opentelemetry-sdk==1.24.0
# docker-compose.yaml
version: '3.8'

services:
  # FastAPI application (3 replicas)
  llm-api:
    image: registry.example.com/llm-serve:v1.0.0
    deploy:
      replicas: 3
      resources:
        limits:
          cpus: '4'
          memory: 8G
        reservations:
          cpus: '2'
          memory: 4G
    # Only exposed on the internal network: three replicas cannot all bind
    # host port 8000, and clients are meant to enter through nginx.
    expose:
      - "8000"
    environment:
      - REDIS_URL=redis://redis:6379/0
      - LOG_LEVEL=INFO
    depends_on:
      - redis
    networks:
      - llm-net
    restart: always

  # Redis (rate limiting + cache)
  redis:
    image: redis:7-alpine
    command: redis-server --maxmemory 512mb --maxmemory-policy allkeys-lru
    # Bound to loopback so the unauthenticated instance is reachable for
    # local debugging but not from other hosts.
    ports:
      - "127.0.0.1:6379:6379"
    volumes:
      - redis-data:/data
    networks:
      - llm-net
    restart: always

  # Nginx (reverse proxy + rate limiting)
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl/:/etc/nginx/ssl/:ro
    depends_on:
      - llm-api
    networks:
      - llm-net
    restart: always

networks:
  llm-net:
    driver: bridge

volumes:
  redis-data:
# nginx.conf
worker_processes auto;
error_log /var/log/nginx/error.log warn;
events {
worker_connections 1024;
}
http {
include /etc/nginx/mime.types;
default_type application/octet-stream;
# Access log format (ends with the request processing time)
log_format main '$remote_addr - $remote_user [$time_local] "$request" '
'$status $body_bytes_sent "$http_referer" '
'"$http_user_agent" $request_time';
access_log /var/log/nginx/access.log main;
# Performance tuning
sendfile on;
tcp_nopush on;
tcp_nodelay on;
keepalive_timeout 65;
gzip on;
gzip_types text/plain application/json application/javascript text/css;
# Rate-limit zones (nginx layer), keyed by client address:
# api_limit allows 50 requests/second, conn_limit tracks concurrent connections.
limit_req_zone $binary_remote_addr zone=api_limit:10m rate=50r/s;
limit_conn_zone $binary_remote_addr zone=conn_limit:10m;
upstream llm_backend {
    least_conn;  # prefer the backend with the fewest active connections
    # Use the compose service name: it resolves to every llm-api replica,
    # whereas "llm-api-1/2/3" are not names the compose network provides.
    # The replicas were equally weighted, so no weight is needed.
    server llm-api:8000;
    keepalive 32;
}
server {
listen 80;
server_name api.example.com;
# Plain HTTP only redirects permanently to the HTTPS site
return 301 https://$server_name$request_uri;
}
server {
    listen 443 ssl http2;
    server_name api.example.com;

    # TLS configuration
    ssl_certificate /etc/nginx/ssl/example.com.crt;
    ssl_certificate_key /etc/nginx/ssl/example.com.key;
    ssl_protocols TLSv1.2 TLSv1.3;
    ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256;
    ssl_prefer_server_ciphers on;
    ssl_session_cache shared:SSL:10m;

    # Proxy to the backend
    location / {
        # Request limits are declared here instead of at server level:
        # limit_req / limit_conn have no "off" form, so the only way to
        # exempt /health is not to inherit them there.
        limit_req zone=api_limit burst=100 nodelay;
        limit_conn conn_limit 20;

        proxy_pass http://llm_backend;
        proxy_http_version 1.1;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Connection reuse (keep-alive to the upstream)
        proxy_set_header Connection "";

        # Timeouts
        proxy_connect_timeout 60s;
        proxy_send_timeout 60s;
        proxy_read_timeout 120s;

        # Streaming responses: pass data through without buffering
        proxy_buffering off;
        proxy_cache off;
    }

    # Health check (not subject to the request / connection limits)
    location /health {
        proxy_pass http://llm_backend/health;
        proxy_set_header Host $host;
    }
}
}
# Wrong: reloading the model before every request
# Right: keep the Ollama service running with the model resident

# Show the models currently loaded in memory
curl http://localhost:11434/api/ps
# Show the models available locally (downloaded, not necessarily loaded)
curl http://localhost:11434/api/tags

# Check GPU memory usage
nvidia-smi

# If GPU memory is not released, unload the model manually.
# keep_alive 0 means "unload immediately". The body must be plain JSON:
# a "#" comment inside it makes the request invalid.
curl -X POST http://localhost:11434/api/generate -d '{
  "model": "qwen2-72b",
  "keep_alive": 0
}'

# Correct configuration: keep the model resident with a long keep_alive
# (here 86400 seconds = 24 hours). keep_alive is a top-level request field,
# not an entry of "options"; an empty "messages" list just loads the model.
# "num_gpu" was dropped: it sets the number of layers offloaded to the GPU,
# not the number of GPUs, so a value of 1 would cripple performance.
curl -X POST http://localhost:11434/api/chat -d '{
  "model": "qwen2-72b",
  "messages": [],
  "keep_alive": 86400
}'
# Cause: the model is too large for the available GPU memory
# Fix: use a quantized variant or adjust tensor parallelism
# NOTE(review): confirm that the installed vLLM version provides the
# `vllm serve` command and the fp8 quantization option used below.
# Method 1: FP8 quantization (the flag below selects fp8, not INT4)
vllm serve meta-llama/Llama-3-70b-instruct \
--dtype half \
--quantization fp8 # FP8 quantization; the original note claims ~50% lower GPU memory
# Method 2: tensor parallelism (multiple GPUs)
vllm serve meta-llama/Llama-3-70b-instruct \
--tensor-parallel-size 2 # use 2 GPUs
# Method 3: lower max_model_len (context window)
vllm serve meta-llama/Llama-3-70b-instruct \
--dtype half \
--max-model-len 32768 # cap the context length to save GPU memory
# Show GPU memory usage
nvidia-smi --query-gpu=memory.used,memory.total --format=csv
# A 502 usually points at the backend model service: the router above
# answers 502 when the model API returns an error status.
# NOTE(review): nginx also returns 502 when it cannot reach uvicorn — check both.
# Troubleshooting steps:
# 1. Check that the model services are running
curl http://localhost:11434/api/tags # Ollama
curl http://localhost:8000/health # vLLM (NOTE(review): same port as the API app's default — confirm)
# 2. Check the uvicorn logs
# Find the container ID
docker ps | grep llm-api
# Follow the logs
docker logs -f <container_id>
# 3. Tune the timeout configuration
# Accept a timeout parameter in the request.
# The route path omits "/v1" because the router is created with
# prefix="/v1"; repeating it would register /v1/v1/chat/completions.
@router.post("/chat/completions")
async def chat_completions(request: ChatRequest):
    # Make sure the timeout is large enough; fall back to 120 seconds.
    timeout = request.timeout or 120
    # Pass `timeout` on to the model call.
本文涵盖了 FastAPI + LLM 生产级部署的核心要素: