diff --git a/backend/main.py b/backend/main.py index 3fe71d1..8be2865 100644 --- a/backend/main.py +++ b/backend/main.py @@ -1,5 +1,7 @@ +import json import logging import os +import time from logging.handlers import RotatingFileHandler from pathlib import Path @@ -102,12 +104,14 @@ async def generate(request: Request, db: Session = Depends(get_db)): prompt_preview = (body.get("prompt", "").replace("\n", " ").strip())[:120] usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"', request.state.api_key_name, body.get("model", "?"), prompt_tokens, prompt_preview) + start = time.monotonic() try: response = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body) resp_json = response.json() - usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens', + usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens | %.1fs', request.state.api_key_name, body.get("model", "?"), - resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0)) + resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0), + time.monotonic() - start) return JSONResponse(content=resp_json, status_code=response.status_code) except Exception as exc: error_log.error("Proxy error | %s | /api/generate | %s | %s: %s", @@ -131,12 +135,14 @@ async def chat(request: Request, db: Session = Depends(get_db)): usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"', request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages)) + start = time.monotonic() try: response = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body) resp_json = response.json() - usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens', + usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens | %.1fs', request.state.api_key_name, body.get("model", "?"), - resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0)) + resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0), + time.monotonic() - start) return JSONResponse(content=resp_json, status_code=response.status_code) except Exception as exc: error_log.error("Proxy error | %s | /api/chat | %s | %s: %s", @@ -195,29 +201,51 @@ async def openai_chat_completions(request: Request, db: Session = Depends(get_db target = f"{ollama_url}/v1/chat/completions" if body.get("stream"): + existing_opts = body.get("stream_options") or {} + stream_body = {**body, "stream_options": {**existing_opts, "include_usage": True}} + start = time.monotonic() + usage_tokens = {"prompt": 0, "completion": 0} + async def generate(): try: async with httpx.AsyncClient(timeout=300.0) as client: - async with client.stream("POST", target, json=body) as resp: + async with client.stream("POST", target, json=stream_body) as resp: async for chunk in resp.aiter_bytes(): + try: + for line in chunk.decode("utf-8", errors="ignore").splitlines(): + if line.startswith("data: ") and "[DONE]" not in line: + data = json.loads(line[6:]) + if u := data.get("usage"): + usage_tokens["prompt"] = u.get("prompt_tokens", 0) + usage_tokens["completion"] = u.get("completion_tokens", 0) + except Exception: + pass yield chunk except Exception as exc: error_log.error("Stream error | %s | /v1/chat/completions | %s | %s: %s", request.state.api_key_name, model_name, type(exc).__name__, exc, exc_info=exc) raise + finally: + usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs', + request.state.api_key_name, model_name, + usage_tokens["prompt"], usage_tokens["completion"], + time.monotonic() - start) + return StreamingResponse( generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, ) + start = time.monotonic() try: response = await proxy_request(target, method="POST", json_data=body) resp_json = response.json() usage = resp_json.get("usage", {}) - usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens', + usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs', request.state.api_key_name, model_name, - usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0)) + usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0), + time.monotonic() - start) return JSONResponse(content=resp_json, status_code=response.status_code) except Exception as exc: error_log.error("Proxy error | %s | /v1/chat/completions | %s | %s: %s",