Log actual tokens and elapsed time for all endpoints incl. streaming

For streaming /v1/chat/completions: inject stream_options.include_usage, parse usage from SSE chunks, log actual ↑↓ tokens and wall time in the generator's finally block. Add elapsed time to all second log entries.
2026-05-08 09:47:32 +02:00 · 2026-05-08 09:47:32 +02:00 · 70fd61608b
commit 70fd61608b
parent 07f6fec4bf
1 changed files with 35 additions and 7 deletions
--- a/backend/main.py
+++ b/backend/main.py
@ -1,5 +1,7 @@
+import json
 import logging
 import os
+import time
 from logging.handlers import RotatingFileHandler
 from pathlib import Path

@ -102,12 +104,14 @@ async def generate(request: Request, db: Session = Depends(get_db)):
    prompt_preview = (body.get("prompt", "").replace("\n", " ").strip())[:120]
    usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, body.get("model", "?"), prompt_tokens, prompt_preview)
+    start = time.monotonic()
    try:
        response = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body)
        resp_json = response.json()
-        usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens',
+        usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens | %.1fs',
                       request.state.api_key_name, body.get("model", "?"),
-                       resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
+                       resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
+                       time.monotonic() - start)
        return JSONResponse(content=resp_json, status_code=response.status_code)
    except Exception as exc:
        error_log.error("Proxy error | %s | /api/generate | %s | %s: %s",
@ -131,12 +135,14 @@ async def chat(request: Request, db: Session = Depends(get_db)):

    usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
                   request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages))
+    start = time.monotonic()
    try:
        response = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body)
        resp_json = response.json()
-        usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens',
+        usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens | %.1fs',
                       request.state.api_key_name, body.get("model", "?"),
-                       resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
+                       resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
+                       time.monotonic() - start)
        return JSONResponse(content=resp_json, status_code=response.status_code)
    except Exception as exc:
        error_log.error("Proxy error | %s | /api/chat | %s | %s: %s",
@ -195,29 +201,51 @@ async def openai_chat_completions(request: Request, db: Session = Depends(get_db
    target = f"{ollama_url}/v1/chat/completions"

    if body.get("stream"):
+        existing_opts = body.get("stream_options") or {}
+        stream_body = {**body, "stream_options": {**existing_opts, "include_usage": True}}
+        start = time.monotonic()
+        usage_tokens = {"prompt": 0, "completion": 0}
+
        async def generate():
            try:
                async with httpx.AsyncClient(timeout=300.0) as client:
-                    async with client.stream("POST", target, json=body) as resp:
+                    async with client.stream("POST", target, json=stream_body) as resp:
                        async for chunk in resp.aiter_bytes():
+                            try:
+                                for line in chunk.decode("utf-8", errors="ignore").splitlines():
+                                    if line.startswith("data: ") and "[DONE]" not in line:
+                                        data = json.loads(line[6:])
+                                        if u := data.get("usage"):
+                                            usage_tokens["prompt"] = u.get("prompt_tokens", 0)
+                                            usage_tokens["completion"] = u.get("completion_tokens", 0)
+                            except Exception:
+                                pass
                            yield chunk
            except Exception as exc:
                error_log.error("Stream error | %s | /v1/chat/completions | %s | %s: %s",
                                request.state.api_key_name, model_name, type(exc).__name__, exc, exc_info=exc)
                raise
+            finally:
+                usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs',
+                               request.state.api_key_name, model_name,
+                               usage_tokens["prompt"], usage_tokens["completion"],
+                               time.monotonic() - start)
+
        return StreamingResponse(
            generate(),
            media_type="text/event-stream",
            headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
        )

+    start = time.monotonic()
    try:
        response = await proxy_request(target, method="POST", json_data=body)
        resp_json = response.json()
        usage = resp_json.get("usage", {})
-        usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens',
+        usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs',
                       request.state.api_key_name, model_name,
-                       usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
+                       usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0),
+                       time.monotonic() - start)
        return JSONResponse(content=resp_json, status_code=response.status_code)
    except Exception as exc:
        error_log.error("Proxy error | %s | /v1/chat/completions | %s | %s: %s",