Log actual tokens and elapsed time for all endpoints incl. streaming

For streaming /v1/chat/completions: inject stream_options.include_usage,
parse usage from SSE chunks, log actual ↑↓ tokens and wall time in the
generator's finally block. Add elapsed time to all second log entries.
This commit is contained in:
Oliver Hofmann 2026-05-08 09:47:32 +02:00
parent 07f6fec4bf
commit 70fd61608b

View File

@ -1,5 +1,7 @@
import json
import logging
import os
import time
from logging.handlers import RotatingFileHandler
from pathlib import Path
@ -102,12 +104,14 @@ async def generate(request: Request, db: Session = Depends(get_db)):
prompt_preview = (body.get("prompt", "").replace("\n", " ").strip())[:120]
usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
request.state.api_key_name, body.get("model", "?"), prompt_tokens, prompt_preview)
start = time.monotonic()
try:
response = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body)
resp_json = response.json()
usage_log.info('%s | /api/generate | %s | actual ↑%d%d tokens',
usage_log.info('%s | /api/generate | %s | actual ↑%d%d tokens | %.1fs',
request.state.api_key_name, body.get("model", "?"),
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
time.monotonic() - start)
return JSONResponse(content=resp_json, status_code=response.status_code)
except Exception as exc:
error_log.error("Proxy error | %s | /api/generate | %s | %s: %s",
@ -131,12 +135,14 @@ async def chat(request: Request, db: Session = Depends(get_db)):
usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages))
start = time.monotonic()
try:
response = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body)
resp_json = response.json()
usage_log.info('%s | /api/chat | %s | actual ↑%d%d tokens',
usage_log.info('%s | /api/chat | %s | actual ↑%d%d tokens | %.1fs',
request.state.api_key_name, body.get("model", "?"),
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
time.monotonic() - start)
return JSONResponse(content=resp_json, status_code=response.status_code)
except Exception as exc:
error_log.error("Proxy error | %s | /api/chat | %s | %s: %s",
@ -195,29 +201,51 @@ async def openai_chat_completions(request: Request, db: Session = Depends(get_db
target = f"{ollama_url}/v1/chat/completions"
if body.get("stream"):
existing_opts = body.get("stream_options") or {}
stream_body = {**body, "stream_options": {**existing_opts, "include_usage": True}}
start = time.monotonic()
usage_tokens = {"prompt": 0, "completion": 0}
async def generate():
try:
async with httpx.AsyncClient(timeout=300.0) as client:
async with client.stream("POST", target, json=body) as resp:
async with client.stream("POST", target, json=stream_body) as resp:
async for chunk in resp.aiter_bytes():
try:
for line in chunk.decode("utf-8", errors="ignore").splitlines():
if line.startswith("data: ") and "[DONE]" not in line:
data = json.loads(line[6:])
if u := data.get("usage"):
usage_tokens["prompt"] = u.get("prompt_tokens", 0)
usage_tokens["completion"] = u.get("completion_tokens", 0)
except Exception:
pass
yield chunk
except Exception as exc:
error_log.error("Stream error | %s | /v1/chat/completions | %s | %s: %s",
request.state.api_key_name, model_name, type(exc).__name__, exc, exc_info=exc)
raise
finally:
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d%d tokens | %.1fs',
request.state.api_key_name, model_name,
usage_tokens["prompt"], usage_tokens["completion"],
time.monotonic() - start)
return StreamingResponse(
generate(),
media_type="text/event-stream",
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
)
start = time.monotonic()
try:
response = await proxy_request(target, method="POST", json_data=body)
resp_json = response.json()
usage = resp_json.get("usage", {})
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d%d tokens',
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d%d tokens | %.1fs',
request.state.api_key_name, model_name,
usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0),
time.monotonic() - start)
return JSONResponse(content=resp_json, status_code=response.status_code)
except Exception as exc:
error_log.error("Proxy error | %s | /v1/chat/completions | %s | %s: %s",