Oliver Hofmann dd8f69ecb6 Proxy fixes, streaming support, Admin-UI overhaul
Backend:
- Fix Content-Length mismatch by not forwarding client headers to Ollama
- Proxy /v1/chat/completions directly to Ollama's OpenAI-compatible endpoint
  (eliminates manual Ollama↔OpenAI format conversion, fixes tool use)
- Add streaming support via SSE passthrough
- Fix ollama_url /v1 suffix stripped on save
- Replace BaseHTTPMiddleware with FastAPI global dependency (fixes double logging)
- Add rotating usage log (8 KB, logs key name + model + token estimate + prompt preview)
- Add httpx timeout 300s
- Add activate and delete endpoints for API keys
- Return usage data (tokens/requests) in GET /api/api-keys

Frontend:
- Admin table: remove ID column, status as icon, icon-only action buttons with CSS tooltips
- Add activate + delete buttons; edit available for inactive keys too
- Quota columns: fixed equal width, progress bars with k-unit formatting
- Create form: structured layout matching edit form style
- Edit form: token inputs in k units (÷1000 display, ×1000 on save)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-29 07:48:10 +02:00

134 lines
6.3 KiB
Python

import logging
import os
from logging.handlers import RotatingFileHandler
from pathlib import Path
from fastapi import FastAPI, HTTPException, Depends, Request
from fastapi.responses import JSONResponse, StreamingResponse
from sqlalchemy.orm import Session
from database import get_db
import crud
import httpx
# --- Usage logging ----------------------------------------------------------
# Rotating usage log (8 KB per file, 3 backups)
# LOG_FILE env var overrides the destination; default lives under logs/.
_log_path = Path(os.getenv("LOG_FILE", "logs/usage.log"))
_log_path.parent.mkdir(parents=True, exist_ok=True)  # ensure the directory exists before the handler opens the file
_handler = RotatingFileHandler(str(_log_path), maxBytes=8192, backupCount=3, encoding="utf-8")
_handler.setFormatter(logging.Formatter("%(asctime)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S"))
# Dedicated named logger so usage lines go only to the rotating file.
usage_log = logging.getLogger("proxy.usage")
usage_log.setLevel(logging.INFO)
usage_log.addHandler(_handler)
usage_log.propagate = False  # keep usage entries out of the root logger's handlers
def _last_user_msg(messages: list, max_len: int = 120) -> str:
for msg in reversed(messages):
if msg.get("role") == "user":
text = (msg.get("content") or "").replace("\n", " ").strip()
return text[:max_len] + ("" if len(text) > max_len else "")
return ""
async def require_api_key(request: Request, db: Session = Depends(get_db)):
    """Global auth dependency: validate the caller's API key.

    Accepts either an "Authorization: Bearer sk-..." header or a bare
    "sk-..." value. On success, stashes the key's id and name on
    request.state for downstream handlers; otherwise raises 401.
    """
    header = request.headers.get("Authorization", "")
    if header.startswith("Bearer "):
        candidate = header[len("Bearer "):]
    elif header.startswith("sk-"):
        # Some clients send the raw key without the Bearer scheme.
        candidate = header
    else:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    record = crud.verify_api_key(db, candidate)
    if not record:
        raise HTTPException(status_code=401, detail="Invalid API key")
    request.state.api_key_id = record.id
    request.state.api_key_name = record.name
# Every route is protected by the require_api_key dependency; per the commit
# notes this replaces a BaseHTTPMiddleware that caused double logging.
app = FastAPI(title="Ollama Proxy", dependencies=[Depends(require_api_key)])
async def proxy_request(url: str, method: str = "GET", json_data: dict = None):
    """Send one request to the upstream Ollama server and return the response.

    A fresh client is created per call with a generous 300 s timeout to
    accommodate slow model generations. Note: client headers are deliberately
    NOT forwarded (avoids Content-Length mismatches per the commit notes).
    """
    async with httpx.AsyncClient(timeout=300.0) as session:
        return await session.request(method=method, url=url, json=json_data)
@app.post("/api/generate")
async def generate(request: Request, db: Session = Depends(get_db)):
    """Proxy a native Ollama /api/generate call with quota + usage logging."""
    base_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    payload = await request.json()
    # Estimate prompt size only; completion tokens are not metered here.
    est_tokens = crud.count_tokens(payload.get("prompt", ""))
    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=est_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    preview = payload.get("prompt", "").replace("\n", " ").strip()[:120]
    usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, payload.get("model", "?"), est_tokens, preview)
    upstream = await proxy_request(f"{base_url}/api/generate", method="POST", json_data=payload)
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
@app.post("/api/chat")
async def chat(request: Request, db: Session = Depends(get_db)):
    """Proxy a native Ollama /api/chat call with quota + usage logging."""
    base_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    payload = await request.json()
    msgs = payload.get("messages", [])
    # Rough prompt-size estimate summed over the whole conversation.
    est_tokens = 0
    for m in msgs:
        est_tokens += crud.count_tokens(m.get("content") or "")
    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=est_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, payload.get("model", "?"), est_tokens, _last_user_msg(msgs))
    upstream = await proxy_request(f"{base_url}/api/chat", method="POST", json_data=payload)
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
@app.get("/api/tags")
async def list_models(db: Session = Depends(get_db)):
    """Pass-through for Ollama's installed-model listing."""
    base_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    upstream = await proxy_request(f"{base_url}/api/tags")
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
@app.get("/api/versions")
async def versions(db: Session = Depends(get_db)):
    """Forward /api/versions to the configured Ollama server.

    NOTE(review): upstream Ollama documents /api/version (singular) — confirm
    this path actually resolves against the target server.
    """
    base_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    upstream = await proxy_request(f"{base_url}/api/versions")
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
@app.get("/v1/models")
async def list_openai_models(db: Session = Depends(get_db)):
    """Pass-through for Ollama's OpenAI-compatible model listing."""
    base_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    upstream = await proxy_request(f"{base_url}/v1/models")
    return JSONResponse(content=upstream.json(), status_code=upstream.status_code)
@app.post("/v1/chat/completions")
async def openai_chat_completions(request: Request, db: Session = Depends(get_db)):
    """Proxy an OpenAI-style chat completion to Ollama's /v1 endpoint.

    Enforces the caller's quota, writes a usage-log line, fills in the
    configured default model when the client omits "model", and handles both
    streaming (SSE passthrough) and non-streaming responses.
    """
    ollama_url = crud.get_setting(db, "ollama_url", os.getenv("OLLAMA_URL", "http://localhost:11434"))
    default_model = crud.get_setting(db, "default_model", os.getenv("DEFAULT_MODEL", "llama3"))
    body = await request.json()
    messages = body.get("messages", [])
    # Prompt-side estimate only; completion tokens are not counted here.
    prompt_tokens = sum(crud.count_tokens(msg.get("content") or "") for msg in messages)
    if not crud.check_and_increment_quota(db, request.state.api_key_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    if "model" not in body:
        # Copy rather than mutate the parsed body in place.
        body = {**body, "model": default_model}
    model_name = body["model"]
    usage_log.info('%s | /v1/chat/completions | %s | ~%d tokens | "%s"',
    request.state.api_key_name, model_name, prompt_tokens, _last_user_msg(messages))
    target = f"{ollama_url}/v1/chat/completions"
    if body.get("stream"):
        # SSE passthrough: forward upstream chunks verbatim. The client
        # stays open for the lifetime of the generator.
        async def generate():
            async with httpx.AsyncClient(timeout=300.0) as client:
                async with client.stream("POST", target, json=body) as resp:
                    async for chunk in resp.aiter_bytes():
                        yield chunk
        # X-Accel-Buffering: no — disables proxy buffering so chunks reach
        # the client immediately.
        return StreamingResponse(
            generate(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )
    # Non-streaming path: plain request/response round trip.
    response = await proxy_request(target, method="POST", json_data=body)
    return JSONResponse(content=response.json(), status_code=response.status_code)