Backend: - Fix Content-Length mismatch by not forwarding client headers to Ollama - Proxy /v1/chat/completions directly to Ollama's OpenAI-compatible endpoint (eliminates manual Ollama↔OpenAI format conversion, fixes tool use) - Add streaming support via SSE passthrough - Fix ollama_url /v1 suffix stripped on save - Replace BaseHTTPMiddleware with FastAPI global dependency (fixes double logging) - Add rotating usage log (8 KB, logs key name + model + token estimate + prompt preview) - Add httpx timeout 300s - Add activate and delete endpoints for API keys - Return usage data (tokens/requests) in GET /api/api-keys Frontend: - Admin table: remove ID column, status as icon, icon-only action buttons with CSS tooltips - Add activate + delete buttons; edit available for inactive keys too - Quota columns: fixed equal width, progress bars with k-unit formatting - Create form: structured layout matching edit form style - Edit form: token inputs in k units (÷1000 display, ×1000 on save) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
134 lines
6.3 KiB
Python
134 lines
6.3 KiB
Python
import logging
|
|
import os
|
|
from logging.handlers import RotatingFileHandler
|
|
from pathlib import Path
|
|
|
|
from fastapi import FastAPI, HTTPException, Depends, Request
|
|
from fastapi.responses import JSONResponse, StreamingResponse
|
|
from sqlalchemy.orm import Session
|
|
from database import get_db
|
|
import crud
|
|
import httpx
|
|
|
|
# --- Usage audit log ---------------------------------------------------------
# Small rotating log (8 KB per file, 3 backups) recording which API key used
# which model, an estimated token count, and a short prompt preview.
_LOG_FILE = Path(os.getenv("LOG_FILE", "logs/usage.log"))
_LOG_FILE.parent.mkdir(parents=True, exist_ok=True)

_usage_handler = RotatingFileHandler(
    str(_LOG_FILE), maxBytes=8192, backupCount=3, encoding="utf-8"
)
_usage_handler.setFormatter(
    logging.Formatter("%(asctime)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
)

usage_log = logging.getLogger("proxy.usage")
usage_log.setLevel(logging.INFO)
usage_log.addHandler(_usage_handler)
# Keep usage records out of the root logger's output.
usage_log.propagate = False
|
|
|
|
def _last_user_msg(messages: list, max_len: int = 120) -> str:
|
|
for msg in reversed(messages):
|
|
if msg.get("role") == "user":
|
|
text = (msg.get("content") or "").replace("\n", " ").strip()
|
|
return text[:max_len] + ("…" if len(text) > max_len else "")
|
|
return ""
|
|
|
|
async def require_api_key(request: Request, db: Session = Depends(get_db)):
    """Global auth dependency: validate the API key on every route.

    Accepts either ``Authorization: Bearer <key>`` or a bare ``sk-…`` value.
    On success, stashes the key's id and name on ``request.state`` for the
    endpoint handlers; otherwise raises 401.
    """
    auth_header = request.headers.get("Authorization", "")
    if auth_header.startswith("Bearer "):
        api_key = auth_header[len("Bearer "):]
    elif auth_header.startswith("sk-"):
        # Some clients omit the Bearer prefix and send the raw key.
        api_key = auth_header
    else:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")

    db_key = crud.verify_api_key(db, api_key)
    if not db_key:
        raise HTTPException(status_code=401, detail="Invalid API key")

    request.state.api_key_id = db_key.id
    request.state.api_key_name = db_key.name
|
|
|
|
app = FastAPI(title="Ollama Proxy", dependencies=[Depends(require_api_key)])
|
|
|
|
async def proxy_request(url: str, method: str = "GET", json_data: dict = None):
    """Forward one request to Ollama and return the httpx response.

    Deliberately sends a fresh request (no client headers forwarded) — per
    the change log, forwarding them caused Content-Length mismatches. The
    300 s timeout accommodates slow model generation. ``json_data`` may be
    None for body-less GETs.
    """
    async with httpx.AsyncClient(timeout=300.0) as client:
        # Non-streaming: the response body is fully read before the client
        # context closes, so returning it afterwards is safe.
        response = await client.request(method=method, url=url, json=json_data)
        return response
|
|
|
|
@app.post("/api/generate")
async def generate(request: Request, db: Session = Depends(get_db)):
    """Proxy Ollama's native /api/generate, enforcing per-key quotas."""
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    body = await request.json()

    prompt = body.get("prompt", "")
    prompt_tokens = crud.count_tokens(prompt)
    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")

    preview = prompt.replace("\n", " ").strip()[:120]
    usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, body.get("model", "?"), prompt_tokens, preview)

    # NOTE(review): response.json() assumes a non-streaming reply; a request
    # with "stream": true gets NDJSON back from Ollama — confirm clients of
    # this route never stream.
    upstream = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body)
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
|
|
|
|
@app.post("/api/chat")
async def chat(request: Request, db: Session = Depends(get_db)):
    """Proxy Ollama's native /api/chat, enforcing per-key quotas."""
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    body = await request.json()
    messages = body.get("messages", [])

    # Rough prompt size: token estimate summed over every message's content.
    prompt_tokens = sum(crud.count_tokens(m.get("content") or "") for m in messages)
    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")

    usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages))

    upstream = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body)
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
|
|
|
|
@app.get("/api/tags")
async def list_models(db: Session = Depends(get_db)):
    """Proxy Ollama's model listing (GET /api/tags)."""
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    upstream = await proxy_request(f"{ollama_url}/api/tags")
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
|
|
|
|
@app.get("/api/versions")
async def versions(db: Session = Depends(get_db)):
    """Return the Ollama server version.

    The client-facing route stays /api/versions for backward compatibility,
    but Ollama's actual endpoint is GET /api/version (singular) — the old
    code forwarded to /api/versions, which Ollama does not serve (404).
    """
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    response = await proxy_request(f"{ollama_url}/api/version", method="GET")
    return JSONResponse(content=response.json(), status_code=response.status_code)
|
|
|
|
@app.get("/v1/models")
async def list_openai_models(db: Session = Depends(get_db)):
    """Proxy Ollama's OpenAI-compatible model listing (GET /v1/models)."""
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    upstream = await proxy_request(f"{ollama_url}/v1/models")
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
|
|
|
|
@app.post("/v1/chat/completions")
async def openai_chat_completions(request: Request, db: Session = Depends(get_db)):
    """OpenAI-compatible chat endpoint, proxied straight to Ollama's /v1 API.

    Enforces per-key quotas, fills in a default model when the client omits
    one, and passes SSE streaming responses through unmodified.
    """
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    default_model = crud.get_setting(db, "default_model", os.getenv("DEFAULT_MODEL", "llama3"))

    body = await request.json()
    messages = body.get("messages", [])
    prompt_tokens = sum(crud.count_tokens(m.get("content") or "") for m in messages)

    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")

    if "model" not in body:
        # Copy rather than mutate the parsed request body.
        body = {**body, "model": default_model}

    usage_log.info('%s | /v1/chat/completions | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, body["model"], prompt_tokens, _last_user_msg(messages))

    target = f"{ollama_url}/v1/chat/completions"

    if body.get("stream"):
        # SSE passthrough. NOTE(review): the response status is fixed at 200
        # before the upstream connection opens, so upstream errors stream
        # back with HTTP 200 — confirm whether clients need the real status.
        async def relay():
            async with httpx.AsyncClient(timeout=300.0) as client:
                async with client.stream("POST", target, json=body) as upstream:
                    async for chunk in upstream.aiter_bytes():
                        yield chunk

        return StreamingResponse(
            relay(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

    response = await proxy_request(target, method="POST", json_data=body)
    return JSONResponse(content=response.json(), status_code=response.status_code)
|