Log actual tokens and elapsed time for all endpoints incl. streaming
For streaming /v1/chat/completions: inject stream_options.include_usage, parse usage from SSE chunks, log actual ↑↓ tokens and wall time in the generator's finally block. Add elapsed time to all second log entries.
This commit is contained in:
parent
07f6fec4bf
commit
70fd61608b
@ -1,5 +1,7 @@
|
|||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
from logging.handlers import RotatingFileHandler
|
from logging.handlers import RotatingFileHandler
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -102,12 +104,14 @@ async def generate(request: Request, db: Session = Depends(get_db)):
|
|||||||
prompt_preview = (body.get("prompt", "").replace("\n", " ").strip())[:120]
|
prompt_preview = (body.get("prompt", "").replace("\n", " ").strip())[:120]
|
||||||
usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
|
usage_log.info('%s | /api/generate | %s | ~%d tokens | "%s"',
|
||||||
request.state.api_key_name, body.get("model", "?"), prompt_tokens, prompt_preview)
|
request.state.api_key_name, body.get("model", "?"), prompt_tokens, prompt_preview)
|
||||||
|
start = time.monotonic()
|
||||||
try:
|
try:
|
||||||
response = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body)
|
response = await proxy_request(f"{ollama_url}/api/generate", method="POST", json_data=body)
|
||||||
resp_json = response.json()
|
resp_json = response.json()
|
||||||
usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens',
|
usage_log.info('%s | /api/generate | %s | actual ↑%d ↓%d tokens | %.1fs',
|
||||||
request.state.api_key_name, body.get("model", "?"),
|
request.state.api_key_name, body.get("model", "?"),
|
||||||
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
|
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
|
||||||
|
time.monotonic() - start)
|
||||||
return JSONResponse(content=resp_json, status_code=response.status_code)
|
return JSONResponse(content=resp_json, status_code=response.status_code)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
error_log.error("Proxy error | %s | /api/generate | %s | %s: %s",
|
error_log.error("Proxy error | %s | /api/generate | %s | %s: %s",
|
||||||
@ -131,12 +135,14 @@ async def chat(request: Request, db: Session = Depends(get_db)):
|
|||||||
|
|
||||||
usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
|
usage_log.info('%s | /api/chat | %s | ~%d tokens | "%s"',
|
||||||
request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages))
|
request.state.api_key_name, body.get("model", "?"), prompt_tokens, _last_user_msg(messages))
|
||||||
|
start = time.monotonic()
|
||||||
try:
|
try:
|
||||||
response = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body)
|
response = await proxy_request(f"{ollama_url}/api/chat", method="POST", json_data=body)
|
||||||
resp_json = response.json()
|
resp_json = response.json()
|
||||||
usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens',
|
usage_log.info('%s | /api/chat | %s | actual ↑%d ↓%d tokens | %.1fs',
|
||||||
request.state.api_key_name, body.get("model", "?"),
|
request.state.api_key_name, body.get("model", "?"),
|
||||||
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0))
|
resp_json.get("prompt_eval_count", 0), resp_json.get("eval_count", 0),
|
||||||
|
time.monotonic() - start)
|
||||||
return JSONResponse(content=resp_json, status_code=response.status_code)
|
return JSONResponse(content=resp_json, status_code=response.status_code)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
error_log.error("Proxy error | %s | /api/chat | %s | %s: %s",
|
error_log.error("Proxy error | %s | /api/chat | %s | %s: %s",
|
||||||
@ -195,29 +201,51 @@ async def openai_chat_completions(request: Request, db: Session = Depends(get_db
|
|||||||
target = f"{ollama_url}/v1/chat/completions"
|
target = f"{ollama_url}/v1/chat/completions"
|
||||||
|
|
||||||
if body.get("stream"):
|
if body.get("stream"):
|
||||||
|
existing_opts = body.get("stream_options") or {}
|
||||||
|
stream_body = {**body, "stream_options": {**existing_opts, "include_usage": True}}
|
||||||
|
start = time.monotonic()
|
||||||
|
usage_tokens = {"prompt": 0, "completion": 0}
|
||||||
|
|
||||||
async def generate():
|
async def generate():
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=300.0) as client:
|
async with httpx.AsyncClient(timeout=300.0) as client:
|
||||||
async with client.stream("POST", target, json=body) as resp:
|
async with client.stream("POST", target, json=stream_body) as resp:
|
||||||
async for chunk in resp.aiter_bytes():
|
async for chunk in resp.aiter_bytes():
|
||||||
|
try:
|
||||||
|
for line in chunk.decode("utf-8", errors="ignore").splitlines():
|
||||||
|
if line.startswith("data: ") and "[DONE]" not in line:
|
||||||
|
data = json.loads(line[6:])
|
||||||
|
if u := data.get("usage"):
|
||||||
|
usage_tokens["prompt"] = u.get("prompt_tokens", 0)
|
||||||
|
usage_tokens["completion"] = u.get("completion_tokens", 0)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
yield chunk
|
yield chunk
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
error_log.error("Stream error | %s | /v1/chat/completions | %s | %s: %s",
|
error_log.error("Stream error | %s | /v1/chat/completions | %s | %s: %s",
|
||||||
request.state.api_key_name, model_name, type(exc).__name__, exc, exc_info=exc)
|
request.state.api_key_name, model_name, type(exc).__name__, exc, exc_info=exc)
|
||||||
raise
|
raise
|
||||||
|
finally:
|
||||||
|
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs',
|
||||||
|
request.state.api_key_name, model_name,
|
||||||
|
usage_tokens["prompt"], usage_tokens["completion"],
|
||||||
|
time.monotonic() - start)
|
||||||
|
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
generate(),
|
generate(),
|
||||||
media_type="text/event-stream",
|
media_type="text/event-stream",
|
||||||
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
start = time.monotonic()
|
||||||
try:
|
try:
|
||||||
response = await proxy_request(target, method="POST", json_data=body)
|
response = await proxy_request(target, method="POST", json_data=body)
|
||||||
resp_json = response.json()
|
resp_json = response.json()
|
||||||
usage = resp_json.get("usage", {})
|
usage = resp_json.get("usage", {})
|
||||||
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens',
|
usage_log.info('%s | /v1/chat/completions | %s | actual ↑%d ↓%d tokens | %.1fs',
|
||||||
request.state.api_key_name, model_name,
|
request.state.api_key_name, model_name,
|
||||||
usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0))
|
usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0),
|
||||||
|
time.monotonic() - start)
|
||||||
return JSONResponse(content=resp_json, status_code=response.status_code)
|
return JSONResponse(content=resp_json, status_code=response.status_code)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
error_log.error("Proxy error | %s | /v1/chat/completions | %s | %s: %s",
|
error_log.error("Proxy error | %s | /v1/chat/completions | %s | %s: %s",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user