From c33d155669cfbd38f704640877bbd10e3831acdd Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Thu, 19 Mar 2026 16:44:27 +0530 Subject: [PATCH 1/4] syncing traces with missing score as well --- api-contract-changes.md | 83 +++++++++++++++++++ backend/app/crud/evaluations/langfuse.py | 23 +---- .../app/services/evaluations/evaluation.py | 9 +- 3 files changed, 92 insertions(+), 23 deletions(-) create mode 100644 api-contract-changes.md diff --git a/api-contract-changes.md b/api-contract-changes.md new file mode 100644 index 000000000..b16958bd9 --- /dev/null +++ b/api-contract-changes.md @@ -0,0 +1,83 @@ +# API Contract Changes — v0.8.0 (v0.7.0 → v0.8.0) + +## New Endpoints + +| Endpoint | Method | Description | +|---|---|---| +| `/llm/chain` | POST | Sequential chain execution of LLM calls | +| `/evaluations/stt/files` | GET | List audio files | +| `/evaluations/stt/files/{file_id}` | GET | Get single audio file | +| `/evaluations/stt/samples/{sample_id}` | PATCH | Update STT sample (language, ground truth) | +| `/evaluations/tts/datasets` | POST | Create TTS evaluation dataset | +| `/evaluations/tts/datasets` | GET | List TTS datasets | +| `/evaluations/tts/datasets/{dataset_id}` | GET | Get single TTS dataset | +| `/evaluations/tts/runs` | POST | Start a TTS evaluation run | +| `/evaluations/tts/runs` | GET | List TTS evaluation runs | +| `/evaluations/tts/runs/{run_id}` | GET | Get TTS run with results | +| `/evaluations/tts/results/{result_id}` | PATCH | Update human feedback on TTS result | +| `/evaluations/tts/results/{result_id}` | GET | Get a TTS result | + +## Breaking Changes + +### Validation Error Response Format + +The 422 response body now returns a structured `errors` array instead of a concatenated string. 
`APIResponse` has a new field: + +``` +errors: Optional[list[ValidationErrorDetail]] # each item: { field, message } +``` + +**Clients parsing validation error responses will need to update.** + +### LLM Request Model Changes + +| Field | Old | New | +|---|---|---| +| `KaapiCompletionConfig.provider` | `Literal["openai", "google"]` | `Literal["openai", "google", "sarvamai", "elevenlabs"] \| None` (optional, auto-resolves) | +| `NativeCompletionConfig.provider` | `Literal["openai-native", "google-native"]` | `Literal["openai-native", "google-native", "sarvamai-native", "elevenlabs-native"]` | +| `QueryInput` union | `TextInput \| AudioInput` | `TextInput \| AudioInput \| ImageInput \| PDFInput` | +| `QueryParams.input` | `str \| QueryInput` | `str \| QueryInput \| list[QueryInput]` (list enables mixing modalities) | +| `LlmCall.input_type` | `Literal["text", "audio", "image"]` | `Literal["text", "audio", "image", "pdf", "multimodal"]` | + +## Default Value Changes + +| Field | Old Default | New Default | +|---|---|---| +| `TextLLMParams.temperature` | `None` | `0.1` | +| `STTLLMParams.model` | required | `"gemini-2.5-pro"` | +| `STTLLMParams.instructions` | required `str` | `None` (optional) | +| `STTLLMParams.input_language` | `None` | `"auto"` | +| `STTLLMParams.temperature` min | `0.0` | `0.01` | +| `TTSLLMParams.model` | required | `"gemini-2.5-flash-preview-tts"` | +| `TTSLLMParams.voice` | required | `"Kore"` | +| `TTSLLMParams.language` | required `str` | `None` (optional) | +| `CollectionOptions.batch_size` | `1` | `10` | + +## Other Contract Changes + +- **`STTFeedbackUpdate`**: `is_correct` and `comment` changed from required → optional +- **`ConfigBlob`**: added optional `prompt_template` field for chain `{{input}}` interpolation +- **`DatasetUploadResponse`**: added `description` and `signed_url` fields +- **`FilePublic`**: added `signed_url` field +- **`STTSamplePublic`**: added `signed_url` field +- **`STTSampleCreate`**: added optional `language_id` 
field (per-sample language override) +- **Signed URL support**: added `include_signed_url` query param to GET endpoints for datasets, STT datasets/runs, and TTS endpoints +- **New enums**: `BatchJobType.LLM_CHAIN`, `JobType.LLM_CHAIN`, `ChainStatus` + +## New Providers + +- **Sarvam AI** (`sarvamai`) — requires `api_key` +- **ElevenLabs** (`elevenlabs`) — requires `api_key` + +## Database Migrations + +- **048**: `llm_chain` table + `chain_id` FK on `llm_call` + `LLM_CHAIN` job type enum +- **049**: `tts_result` table with indexes (`ix_tts_result_run_id`, `idx_tts_result_feedback`, `idx_tts_result_status`) + +## Key Architectural Additions + +1. **Multimodal inputs** — image & PDF support in `/llm/call` +2. **LLM Chain** — sequential multi-block execution with intermediate callbacks +3. **TTS Evaluation** — complete evaluation suite parallel to STT evaluation +4. **ElevenLabs + Sarvam** — two new STT/TTS providers +5. **Smart defaults** — auto provider/model selection for STT/TTS diff --git a/backend/app/crud/evaluations/langfuse.py b/backend/app/crud/evaluations/langfuse.py index 1dd0c519e..139aeca44 100644 --- a/backend/app/crud/evaluations/langfuse.py +++ b/backend/app/crud/evaluations/langfuse.py @@ -492,28 +492,9 @@ def fetch_trace_scores_from_langfuse( ) continue - # 4. Identify complete scores (all traces must have the score) - total_traces = len(traces) - complete_score_names = { - name - for name, data in score_aggregations.items() - if len(data["values"]) == total_traces - } - - # 5. Filter trace scores to only include complete scores - for trace in traces: - trace["scores"] = [ - score - for score in trace["scores"] - if score["name"] in complete_score_names - ] - - # 6. Calculate summary scores (only for complete scores) + # 4. 
Calculate summary scores for all scores that have at least one value summary_scores = [] for score_name, agg_data in score_aggregations.items(): - if score_name not in complete_score_names: - continue - data_type = agg_data["data_type"] values = agg_data["values"] @@ -552,7 +533,7 @@ def fetch_trace_scores_from_langfuse( logger.info( f"[fetch_trace_scores_from_langfuse] Successfully fetched scores | " - f"total_traces={len(traces)} | complete_scores={list(complete_score_names)}" + f"total_traces={len(traces)} | score_names={list(score_aggregations.keys())}" ) return result diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 594122cb4..3218445c1 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -239,7 +239,10 @@ def get_evaluation_with_scores( if not get_trace_info: return eval_run, None - # Check if we already have cached traces + # Caching strategy: On the first request, trace scores are fetched from Langfuse + # and a copy is stored in S3 (and DB as fallback). Subsequent requests serve from + # S3/DB instead of Langfuse, which is significantly faster. Use resync_score=true + # to bypass the cache and re-fetch from Langfuse (e.g., when new evaluators have run). 
has_cached_traces_s3 = eval_run.score_trace_url is not None has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score if not resync_score: @@ -276,6 +279,7 @@ def get_evaluation_with_scores( ) return eval_run, None + # Cache miss or resync requested — fetch fresh scores from Langfuse langfuse = get_langfuse_client( session=session, org_id=organization_id, @@ -310,6 +314,7 @@ def get_evaluation_with_scores( ) return eval_run, f"Failed to fetch trace info from Langfuse: {str(e)}" + print(langfuse_score) # Merge summary_scores: existing scores + new scores from Langfuse # Create a map of existing scores by name existing_scores_map = {s["name"]: s for s in existing_summary_scores} @@ -321,7 +326,7 @@ def get_evaluation_with_scores( merged_summary_scores = list(existing_scores_map.values()) - # Build final score with merged summary_scores and traces + # Build final score and persist to S3/DB for future cached reads score = { "summary_scores": merged_summary_scores, "traces": langfuse_score.get("traces", []), From edd2d27810e219aca1336e2fee2b724a883224ed Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 20 Mar 2026 15:47:53 +0530 Subject: [PATCH 2/4] cleanup --- api-contract-changes.md | 83 ----------------------------------------- 1 file changed, 83 deletions(-) delete mode 100644 api-contract-changes.md diff --git a/api-contract-changes.md b/api-contract-changes.md deleted file mode 100644 index b16958bd9..000000000 --- a/api-contract-changes.md +++ /dev/null @@ -1,83 +0,0 @@ -# API Contract Changes — v0.8.0 (v0.7.0 → v0.8.0) - -## New Endpoints - -| Endpoint | Method | Description | -|---|---|---| -| `/llm/chain` | POST | Sequential chain execution of LLM calls | -| `/evaluations/stt/files` | GET | List audio files | -| `/evaluations/stt/files/{file_id}` | GET | Get single audio file | -| `/evaluations/stt/samples/{sample_id}` | PATCH | Update STT sample (language, ground truth) | -| `/evaluations/tts/datasets` | POST | Create TTS 
evaluation dataset | -| `/evaluations/tts/datasets` | GET | List TTS datasets | -| `/evaluations/tts/datasets/{dataset_id}` | GET | Get single TTS dataset | -| `/evaluations/tts/runs` | POST | Start a TTS evaluation run | -| `/evaluations/tts/runs` | GET | List TTS evaluation runs | -| `/evaluations/tts/runs/{run_id}` | GET | Get TTS run with results | -| `/evaluations/tts/results/{result_id}` | PATCH | Update human feedback on TTS result | -| `/evaluations/tts/results/{result_id}` | GET | Get a TTS result | - -## Breaking Changes - -### Validation Error Response Format - -The 422 response body now returns a structured `errors` array instead of a concatenated string. `APIResponse` has a new field: - -``` -errors: Optional[list[ValidationErrorDetail]] # each item: { field, message } -``` - -**Clients parsing validation error responses will need to update.** - -### LLM Request Model Changes - -| Field | Old | New | -|---|---|---| -| `KaapiCompletionConfig.provider` | `Literal["openai", "google"]` | `Literal["openai", "google", "sarvamai", "elevenlabs"] \| None` (optional, auto-resolves) | -| `NativeCompletionConfig.provider` | `Literal["openai-native", "google-native"]` | `Literal["openai-native", "google-native", "sarvamai-native", "elevenlabs-native"]` | -| `QueryInput` union | `TextInput \| AudioInput` | `TextInput \| AudioInput \| ImageInput \| PDFInput` | -| `QueryParams.input` | `str \| QueryInput` | `str \| QueryInput \| list[QueryInput]` (list enables mixing modalities) | -| `LlmCall.input_type` | `Literal["text", "audio", "image"]` | `Literal["text", "audio", "image", "pdf", "multimodal"]` | - -## Default Value Changes - -| Field | Old Default | New Default | -|---|---|---| -| `TextLLMParams.temperature` | `None` | `0.1` | -| `STTLLMParams.model` | required | `"gemini-2.5-pro"` | -| `STTLLMParams.instructions` | required `str` | `None` (optional) | -| `STTLLMParams.input_language` | `None` | `"auto"` | -| `STTLLMParams.temperature` min | `0.0` | `0.01` | -| 
`TTSLLMParams.model` | required | `"gemini-2.5-flash-preview-tts"` | -| `TTSLLMParams.voice` | required | `"Kore"` | -| `TTSLLMParams.language` | required `str` | `None` (optional) | -| `CollectionOptions.batch_size` | `1` | `10` | - -## Other Contract Changes - -- **`STTFeedbackUpdate`**: `is_correct` and `comment` changed from required → optional -- **`ConfigBlob`**: added optional `prompt_template` field for chain `{{input}}` interpolation -- **`DatasetUploadResponse`**: added `description` and `signed_url` fields -- **`FilePublic`**: added `signed_url` field -- **`STTSamplePublic`**: added `signed_url` field -- **`STTSampleCreate`**: added optional `language_id` field (per-sample language override) -- **Signed URL support**: added `include_signed_url` query param to GET endpoints for datasets, STT datasets/runs, and TTS endpoints -- **New enums**: `BatchJobType.LLM_CHAIN`, `JobType.LLM_CHAIN`, `ChainStatus` - -## New Providers - -- **Sarvam AI** (`sarvamai`) — requires `api_key` -- **ElevenLabs** (`elevenlabs`) — requires `api_key` - -## Database Migrations - -- **048**: `llm_chain` table + `chain_id` FK on `llm_call` + `LLM_CHAIN` job type enum -- **049**: `tts_result` table with indexes (`ix_tts_result_run_id`, `idx_tts_result_feedback`, `idx_tts_result_status`) - -## Key Architectural Additions - -1. **Multimodal inputs** — image & PDF support in `/llm/call` -2. **LLM Chain** — sequential multi-block execution with intermediate callbacks -3. **TTS Evaluation** — complete evaluation suite parallel to STT evaluation -4. **ElevenLabs + Sarvam** — two new STT/TTS providers -5. 
**Smart defaults** — auto provider/model selection for STT/TTS From 2c82e477b22c96e9f8ab4a358003f20b2662b98f Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 20 Mar 2026 15:51:34 +0530 Subject: [PATCH 3/4] cleanup --- backend/app/services/evaluations/evaluation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 3218445c1..b115f109a 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -279,7 +279,7 @@ def get_evaluation_with_scores( ) return eval_run, None - # Cache miss or resync requested — fetch fresh scores from Langfuse + # Resync requested — fetch fresh scores from Langfuse langfuse = get_langfuse_client( session=session, org_id=organization_id, @@ -314,7 +314,6 @@ def get_evaluation_with_scores( ) return eval_run, f"Failed to fetch trace info from Langfuse: {str(e)}" - print(langfuse_score) # Merge summary_scores: existing scores + new scores from Langfuse # Create a map of existing scores by name existing_scores_map = {s["name"]: s for s in existing_summary_scores} From ff7e77cc1230764f82302373ea7331527755fe2d Mon Sep 17 00:00:00 2001 From: AkhileshNegi Date: Fri, 20 Mar 2026 15:53:01 +0530 Subject: [PATCH 4/4] cleanup --- backend/app/services/evaluations/evaluation.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index b115f109a..55653b253 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -240,9 +240,9 @@ def get_evaluation_with_scores( return eval_run, None # Caching strategy: On the first request, trace scores are fetched from Langfuse - # and a copy is stored in S3 (and DB as fallback). Subsequent requests serve from - # S3/DB instead of Langfuse, which is significantly faster. 
Use resync_score=true - # to bypass the cache and re-fetch from Langfuse (e.g., when new evaluators have run). + # and a copy is stored in S3. Subsequent requests are served from + # S3 instead of Langfuse, which is significantly faster. Use resync_score=true + # to bypass the cache, re-fetch from Langfuse, and store the result in S3 again. has_cached_traces_s3 = eval_run.score_trace_url is not None has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score if not resync_score: