llmproxy/backend/main.py
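
"""FastAPI proxy in front of a local Ollama server.

Every request is authenticated against an API key, per-user token/request
quotas are enforced, and both the native Ollama endpoints and a minimal
OpenAI-compatible surface (/v1/models, /v1/chat/completions) are exposed.
"""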

from fastapi import FastAPI, HTTPException, Depends, Request
from fastapi.responses import JSONResponse
from sqlalchemy.orm import Session
from database import get_db
import crud
import httpx
import os
import time
import uuid

app = FastAPI(title="Ollama Proxy")

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://localhost:11434")
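
# Thin wrapper that forwards a single request to the Ollama backend and returns
# the raw httpx response to the caller.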
async def proxy_request(url: str, method: str = "GET", json_data: dict = None, headers: dict = None):
    async with httpx.AsyncClient() as client:
        response = await client.request(method=method, url=url, json=json_data, headers=headers)
        return response
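
# Hop-by-hop and body-framing headers (Host, Content-Length, Content-Type,
# Content-Encoding, Transfer-Encoding, Connection) describe the original wire
# format; the proxy re-parses and re-serializes bodies on both legs, so these
# fields are dropped before headers are forwarded in either direction.
def sanitized_headers(headers) -> dict:
    skip = {"host", "content-length", "content-type", "content-encoding", "transfer-encoding", "connection"}
    return {key: value for key, value in headers.items() if key.lower() not in skip}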

# Authentication middleware. Exceptions raised inside middleware bypass
# FastAPI's HTTPException handler, so auth failures are returned as
# JSONResponse objects directly.
@app.middleware("http")
async def authenticate_and_quota(request: Request, call_next):
    auth_header = request.headers.get("Authorization", "")
    if auth_header.startswith("Bearer "):
        api_key = auth_header.removeprefix("Bearer ")
    elif auth_header.startswith("sk-"):
        api_key = auth_header
    else:
        return JSONResponse(status_code=401, content={"detail": "Invalid or missing API key"})
    db = next(get_db())
    db_key = crud.verify_api_key(db, api_key)
    if not db_key:
        return JSONResponse(status_code=401, content={"detail": "Invalid API key"})
    if not db_key.is_active:
        return JSONResponse(status_code=403, content={"detail": "API key deactivated"})
    # Expose the key's owner to the route handlers for quota accounting.
    request.state.user_id = db_key.user_id
    return await call_next(request)
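
# Native Ollama completion endpoint. Token usage is estimated from the
# whitespace-separated words in the prompt before the request is forwarded.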
@app.post("/api/generate")
async def generate(request: Request):
    db = next(get_db())
    user_id = request.state.user_id
    body = await request.json()
    prompt_tokens = len(body.get("prompt", "").split())
    if not crud.check_quota(db, user_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    response = await proxy_request(f"{OLLAMA_URL}/api/generate", method="POST", json_data=body, headers=sanitized_headers(request.headers))
    crud.increment_usage(db, user_id, tokens=prompt_tokens, requests=1)
    return JSONResponse(content=response.json(), status_code=response.status_code, headers=sanitized_headers(response.headers))
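
# Native Ollama chat endpoint. Token usage is estimated from the words in all
# chat messages.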
@app.post("/api/chat")
async def chat(request: Request):
    db = next(get_db())
    user_id = request.state.user_id
    body = await request.json()
    prompt_tokens = sum(len(msg.get("content", "").split()) for msg in body.get("messages", []))
    if not crud.check_quota(db, user_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    response = await proxy_request(f"{OLLAMA_URL}/api/chat", method="POST", json_data=body, headers=sanitized_headers(request.headers))
    crud.increment_usage(db, user_id, tokens=prompt_tokens, requests=1)
    return JSONResponse(content=response.json(), status_code=response.status_code, headers=sanitized_headers(response.headers))
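
# Read-only passthroughs to Ollama; authenticated by the middleware but not
# counted against the quota.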
@app.get("/api/tags")
async def list_models(request: Request):
    response = await proxy_request(f"{OLLAMA_URL}/api/tags", method="GET", headers=sanitized_headers(request.headers))
    return JSONResponse(content=response.json(), status_code=response.status_code, headers=sanitized_headers(response.headers))

@app.get("/api/versions")
async def versions(request: Request):
    response = await proxy_request(f"{OLLAMA_URL}/api/versions", method="GET", headers=sanitized_headers(request.headers))
    return JSONResponse(content=response.json(), status_code=response.status_code, headers=sanitized_headers(response.headers))
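
# OpenAI-compatible model listing: Ollama's /api/tags output is reshaped into
# the OpenAI "list of models" format.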
@app.get("/v1/models")
async def list_openai_models(request: Request):
    response = await proxy_request(f"{OLLAMA_URL}/api/tags", method="GET", headers=sanitized_headers(request.headers))
    ollama_models = response.json()
    openai_models = {
        "object": "list",
        "data": [
            {
                "id": model["name"],
                "object": "model",
                # Rough numeric stand-in derived from the modification date
                # (YYYYMMDD * 1000), not a true Unix timestamp.
                "created": int(model["modified_at"][:10].replace("-", "")) * 1000 if "modified_at" in model else 0,
                "owned_by": "ollama"
            }
            for model in ollama_models.get("models", [])
        ]
    }
    return JSONResponse(content=openai_models, status_code=200, headers=sanitized_headers(response.headers))
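
# OpenAI-compatible chat completions: the request is translated to Ollama's
# /api/chat format and the reply is wrapped back into an OpenAI response
# envelope, including word-count based usage figures.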
@app.post("/v1/chat/completions")
async def openai_chat_completions(request: Request):
    db = next(get_db())
    user_id = request.state.user_id
    body = await request.json()
    messages = body.get("messages", [])
    prompt_tokens = sum(len(msg.get("content", "").split()) for msg in messages)
    if not crud.check_quota(db, user_id, tokens=prompt_tokens, requests=1):
        raise HTTPException(status_code=429, detail="Quota exceeded")
    ollama_body = {
        "model": body.get("model", "llama3"),
        "messages": messages,
        # This handler builds a single JSON reply, so streaming is disabled upstream.
        "stream": False
    }
    response = await proxy_request(f"{OLLAMA_URL}/api/chat", method="POST", json_data=ollama_body, headers=sanitized_headers(request.headers))
    crud.increment_usage(db, user_id, tokens=prompt_tokens, requests=1)
    completion_text = response.json().get("message", {}).get("content", "")
    completion_tokens = len(completion_text.split())
    openai_response = {
        "id": f"chatcmpl-{uuid.uuid4().hex}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": body.get("model", "llama3"),
        "choices": [
            {
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": completion_text
                },
                "finish_reason": "stop"
            }
        ],
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": prompt_tokens + completion_tokens
        }
    }
    return JSONResponse(content=openai_response, status_code=200)