From c33d155669cfbd38f704640877bbd10e3831acdd Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Thu, 19 Mar 2026 16:44:27 +0530
Subject: [PATCH 1/4] Sync trace scores even when some traces are missing a score

---
 api-contract-changes.md                       | 83 +++++++++++++++++++
 backend/app/crud/evaluations/langfuse.py      | 23 +----
 .../app/services/evaluations/evaluation.py    |  9 +-
 3 files changed, 92 insertions(+), 23 deletions(-)
 create mode 100644 api-contract-changes.md

diff --git a/api-contract-changes.md b/api-contract-changes.md
new file mode 100644
index 000000000..b16958bd9
--- /dev/null
+++ b/api-contract-changes.md
@@ -0,0 +1,83 @@
+# API Contract Changes — v0.8.0 (v0.7.0 → v0.8.0)
+
+## New Endpoints
+
+| Endpoint | Method | Description |
+|---|---|---|
+| `/llm/chain` | POST | Sequential chain execution of LLM calls |
+| `/evaluations/stt/files` | GET | List audio files |
+| `/evaluations/stt/files/{file_id}` | GET | Get single audio file |
+| `/evaluations/stt/samples/{sample_id}` | PATCH | Update STT sample (language, ground truth) |
+| `/evaluations/tts/datasets` | POST | Create TTS evaluation dataset |
+| `/evaluations/tts/datasets` | GET | List TTS datasets |
+| `/evaluations/tts/datasets/{dataset_id}` | GET | Get single TTS dataset |
+| `/evaluations/tts/runs` | POST | Start a TTS evaluation run |
+| `/evaluations/tts/runs` | GET | List TTS evaluation runs |
+| `/evaluations/tts/runs/{run_id}` | GET | Get TTS run with results |
+| `/evaluations/tts/results/{result_id}` | PATCH | Update human feedback on TTS result |
+| `/evaluations/tts/results/{result_id}` | GET | Get a TTS result |
+
+## Breaking Changes
+
+### Validation Error Response Format
+
+The 422 response body now returns a structured `errors` array instead of a concatenated string. `APIResponse` has a new field:
+
+```
+errors: Optional[list[ValidationErrorDetail]]  # each item: { field, message }
+```
+
+**Clients parsing validation error responses will need to update.**
+
+### LLM Request Model Changes
+
+| Field | Old | New |
+|---|---|---|
+| `KaapiCompletionConfig.provider` | `Literal["openai", "google"]` | `Literal["openai", "google", "sarvamai", "elevenlabs"] \| None` (optional, auto-resolves) |
+| `NativeCompletionConfig.provider` | `Literal["openai-native", "google-native"]` | `Literal["openai-native", "google-native", "sarvamai-native", "elevenlabs-native"]` |
+| `QueryInput` union | `TextInput \| AudioInput` | `TextInput \| AudioInput \| ImageInput \| PDFInput` |
+| `QueryParams.input` | `str \| QueryInput` | `str \| QueryInput \| list[QueryInput]` (list enables mixing modalities) |
+| `LlmCall.input_type` | `Literal["text", "audio", "image"]` | `Literal["text", "audio", "image", "pdf", "multimodal"]` |
+
+## Default Value Changes
+
+| Field | Old Default | New Default |
+|---|---|---|
+| `TextLLMParams.temperature` | `None` | `0.1` |
+| `STTLLMParams.model` | required | `"gemini-2.5-pro"` |
+| `STTLLMParams.instructions` | required `str` | `None` (optional) |
+| `STTLLMParams.input_language` | `None` | `"auto"` |
+| `STTLLMParams.temperature` min | `0.0` | `0.01` |
+| `TTSLLMParams.model` | required | `"gemini-2.5-flash-preview-tts"` |
+| `TTSLLMParams.voice` | required | `"Kore"` |
+| `TTSLLMParams.language` | required `str` | `None` (optional) |
+| `CollectionOptions.batch_size` | `1` | `10` |
+
+## Other Contract Changes
+
+- **`STTFeedbackUpdate`**: `is_correct` and `comment` changed from required → optional
+- **`ConfigBlob`**: added optional `prompt_template` field for chain `{{input}}` interpolation
+- **`DatasetUploadResponse`**: added `description` and `signed_url` fields
+- **`FilePublic`**: added `signed_url` field
+- **`STTSamplePublic`**: added `signed_url` field
+- **`STTSampleCreate`**: added optional `language_id` field (per-sample language override)
+- **Signed URL support**: added `include_signed_url` query param to GET endpoints for datasets, STT datasets/runs, and TTS endpoints
+- **New enums**: `BatchJobType.LLM_CHAIN`, `JobType.LLM_CHAIN`, `ChainStatus`
+
+## New Providers
+
+- **Sarvam AI** (`sarvamai`) — requires `api_key`
+- **ElevenLabs** (`elevenlabs`) — requires `api_key`
+
+## Database Migrations
+
+- **048**: `llm_chain` table + `chain_id` FK on `llm_call` + `LLM_CHAIN` job type enum
+- **049**: `tts_result` table with indexes (`ix_tts_result_run_id`, `idx_tts_result_feedback`, `idx_tts_result_status`)
+
+## Key Architectural Additions
+
+1. **Multimodal inputs** — image & PDF support in `/llm/call`
+2. **LLM Chain** — sequential multi-block execution with intermediate callbacks
+3. **TTS Evaluation** — complete evaluation suite parallel to STT evaluation
+4. **ElevenLabs + Sarvam** — two new STT/TTS providers
+5. **Smart defaults** — auto provider/model selection for STT/TTS
diff --git a/backend/app/crud/evaluations/langfuse.py b/backend/app/crud/evaluations/langfuse.py
index 1dd0c519e..139aeca44 100644
--- a/backend/app/crud/evaluations/langfuse.py
+++ b/backend/app/crud/evaluations/langfuse.py
@@ -492,28 +492,9 @@ def fetch_trace_scores_from_langfuse(
                 )
                 continue
 
-        # 4. Identify complete scores (all traces must have the score)
-        total_traces = len(traces)
-        complete_score_names = {
-            name
-            for name, data in score_aggregations.items()
-            if len(data["values"]) == total_traces
-        }
-
-        # 5. Filter trace scores to only include complete scores
-        for trace in traces:
-            trace["scores"] = [
-                score
-                for score in trace["scores"]
-                if score["name"] in complete_score_names
-            ]
-
-        # 6. Calculate summary scores (only for complete scores)
+        # 4. Calculate summary scores for all scores that have at least one value
         summary_scores = []
         for score_name, agg_data in score_aggregations.items():
-            if score_name not in complete_score_names:
-                continue
-
             data_type = agg_data["data_type"]
             values = agg_data["values"]
 
@@ -552,7 +533,7 @@ def fetch_trace_scores_from_langfuse(
 
         logger.info(
             f"[fetch_trace_scores_from_langfuse] Successfully fetched scores | "
-            f"total_traces={len(traces)} | complete_scores={list(complete_score_names)}"
+            f"total_traces={len(traces)} | score_names={list(score_aggregations.keys())}"
         )
 
         return result
diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py
index 594122cb4..3218445c1 100644
--- a/backend/app/services/evaluations/evaluation.py
+++ b/backend/app/services/evaluations/evaluation.py
@@ -239,7 +239,10 @@ def get_evaluation_with_scores(
     if not get_trace_info:
         return eval_run, None
 
-    # Check if we already have cached traces
+    # Caching strategy: On the first request, trace scores are fetched from Langfuse
+    # and a copy is stored in S3 (and DB as fallback). Subsequent requests serve from
+    # S3/DB instead of Langfuse, which is significantly faster. Use resync_score=true
+    # to bypass the cache and re-fetch from Langfuse (e.g., when new evaluators have run).
     has_cached_traces_s3 = eval_run.score_trace_url is not None
     has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score
     if not resync_score:
@@ -276,6 +279,7 @@ def get_evaluation_with_scores(
             )
             return eval_run, None
 
+    # Cache miss or resync requested — fetch fresh scores from Langfuse
     langfuse = get_langfuse_client(
         session=session,
         org_id=organization_id,
@@ -310,6 +314,7 @@ def get_evaluation_with_scores(
         )
         return eval_run, f"Failed to fetch trace info from Langfuse: {str(e)}"
 
+    print(langfuse_score)
     # Merge summary_scores: existing scores + new scores from Langfuse
     # Create a map of existing scores by name
     existing_scores_map = {s["name"]: s for s in existing_summary_scores}
@@ -321,7 +326,7 @@ def get_evaluation_with_scores(
 
     merged_summary_scores = list(existing_scores_map.values())
 
-    # Build final score with merged summary_scores and traces
+    # Build final score and persist to S3/DB for future cached reads
     score = {
         "summary_scores": merged_summary_scores,
         "traces": langfuse_score.get("traces", []),

From edd2d27810e219aca1336e2fee2b724a883224ed Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Fri, 20 Mar 2026 15:47:53 +0530
Subject: [PATCH 2/4] cleanup: remove api-contract-changes.md

---
 api-contract-changes.md | 83 -----------------------------------------
 1 file changed, 83 deletions(-)
 delete mode 100644 api-contract-changes.md

diff --git a/api-contract-changes.md b/api-contract-changes.md
deleted file mode 100644
index b16958bd9..000000000
--- a/api-contract-changes.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# API Contract Changes — v0.8.0 (v0.7.0 → v0.8.0)
-
-## New Endpoints
-
-| Endpoint | Method | Description |
-|---|---|---|
-| `/llm/chain` | POST | Sequential chain execution of LLM calls |
-| `/evaluations/stt/files` | GET | List audio files |
-| `/evaluations/stt/files/{file_id}` | GET | Get single audio file |
-| `/evaluations/stt/samples/{sample_id}` | PATCH | Update STT sample (language, ground truth) |
-| `/evaluations/tts/datasets` | POST | Create TTS evaluation dataset |
-| `/evaluations/tts/datasets` | GET | List TTS datasets |
-| `/evaluations/tts/datasets/{dataset_id}` | GET | Get single TTS dataset |
-| `/evaluations/tts/runs` | POST | Start a TTS evaluation run |
-| `/evaluations/tts/runs` | GET | List TTS evaluation runs |
-| `/evaluations/tts/runs/{run_id}` | GET | Get TTS run with results |
-| `/evaluations/tts/results/{result_id}` | PATCH | Update human feedback on TTS result |
-| `/evaluations/tts/results/{result_id}` | GET | Get a TTS result |
-
-## Breaking Changes
-
-### Validation Error Response Format
-
-The 422 response body now returns a structured `errors` array instead of a concatenated string. `APIResponse` has a new field:
-
-```
-errors: Optional[list[ValidationErrorDetail]]  # each item: { field, message }
-```
-
-**Clients parsing validation error responses will need to update.**
-
-### LLM Request Model Changes
-
-| Field | Old | New |
-|---|---|---|
-| `KaapiCompletionConfig.provider` | `Literal["openai", "google"]` | `Literal["openai", "google", "sarvamai", "elevenlabs"] \| None` (optional, auto-resolves) |
-| `NativeCompletionConfig.provider` | `Literal["openai-native", "google-native"]` | `Literal["openai-native", "google-native", "sarvamai-native", "elevenlabs-native"]` |
-| `QueryInput` union | `TextInput \| AudioInput` | `TextInput \| AudioInput \| ImageInput \| PDFInput` |
-| `QueryParams.input` | `str \| QueryInput` | `str \| QueryInput \| list[QueryInput]` (list enables mixing modalities) |
-| `LlmCall.input_type` | `Literal["text", "audio", "image"]` | `Literal["text", "audio", "image", "pdf", "multimodal"]` |
-
-## Default Value Changes
-
-| Field | Old Default | New Default |
-|---|---|---|
-| `TextLLMParams.temperature` | `None` | `0.1` |
-| `STTLLMParams.model` | required | `"gemini-2.5-pro"` |
-| `STTLLMParams.instructions` | required `str` | `None` (optional) |
-| `STTLLMParams.input_language` | `None` | `"auto"` |
-| `STTLLMParams.temperature` min | `0.0` | `0.01` |
-| `TTSLLMParams.model` | required | `"gemini-2.5-flash-preview-tts"` |
-| `TTSLLMParams.voice` | required | `"Kore"` |
-| `TTSLLMParams.language` | required `str` | `None` (optional) |
-| `CollectionOptions.batch_size` | `1` | `10` |
-
-## Other Contract Changes
-
-- **`STTFeedbackUpdate`**: `is_correct` and `comment` changed from required → optional
-- **`ConfigBlob`**: added optional `prompt_template` field for chain `{{input}}` interpolation
-- **`DatasetUploadResponse`**: added `description` and `signed_url` fields
-- **`FilePublic`**: added `signed_url` field
-- **`STTSamplePublic`**: added `signed_url` field
-- **`STTSampleCreate`**: added optional `language_id` field (per-sample language override)
-- **Signed URL support**: added `include_signed_url` query param to GET endpoints for datasets, STT datasets/runs, and TTS endpoints
-- **New enums**: `BatchJobType.LLM_CHAIN`, `JobType.LLM_CHAIN`, `ChainStatus`
-
-## New Providers
-
-- **Sarvam AI** (`sarvamai`) — requires `api_key`
-- **ElevenLabs** (`elevenlabs`) — requires `api_key`
-
-## Database Migrations
-
-- **048**: `llm_chain` table + `chain_id` FK on `llm_call` + `LLM_CHAIN` job type enum
-- **049**: `tts_result` table with indexes (`ix_tts_result_run_id`, `idx_tts_result_feedback`, `idx_tts_result_status`)
-
-## Key Architectural Additions
-
-1. **Multimodal inputs** — image & PDF support in `/llm/call`
-2. **LLM Chain** — sequential multi-block execution with intermediate callbacks
-3. **TTS Evaluation** — complete evaluation suite parallel to STT evaluation
-4. **ElevenLabs + Sarvam** — two new STT/TTS providers
-5. **Smart defaults** — auto provider/model selection for STT/TTS

From 2c82e477b22c96e9f8ab4a358003f20b2662b98f Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Fri, 20 Mar 2026 15:51:34 +0530
Subject: [PATCH 3/4] cleanup: remove debug print and reword resync comment

---
 backend/app/services/evaluations/evaluation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py
index 3218445c1..b115f109a 100644
--- a/backend/app/services/evaluations/evaluation.py
+++ b/backend/app/services/evaluations/evaluation.py
@@ -279,7 +279,7 @@ def get_evaluation_with_scores(
             )
             return eval_run, None
 
-    # Cache miss or resync requested — fetch fresh scores from Langfuse
+    # Resync requested — fetch fresh scores from Langfuse
     langfuse = get_langfuse_client(
         session=session,
         org_id=organization_id,
@@ -314,7 +314,6 @@ def get_evaluation_with_scores(
         )
         return eval_run, f"Failed to fetch trace info from Langfuse: {str(e)}"
 
-    print(langfuse_score)
     # Merge summary_scores: existing scores + new scores from Langfuse
     # Create a map of existing scores by name
     existing_scores_map = {s["name"]: s for s in existing_summary_scores}

From ff7e77cc1230764f82302373ea7331527755fe2d Mon Sep 17 00:00:00 2001
From: AkhileshNegi <akhileshnegi.an3@gmail.com>
Date: Fri, 20 Mar 2026 15:53:01 +0530
Subject: [PATCH 4/4] cleanup: reword caching strategy comment

---
 backend/app/services/evaluations/evaluation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py
index b115f109a..55653b253 100644
--- a/backend/app/services/evaluations/evaluation.py
+++ b/backend/app/services/evaluations/evaluation.py
@@ -240,9 +240,9 @@ def get_evaluation_with_scores(
         return eval_run, None
 
     # Caching strategy: On the first request, trace scores are fetched from Langfuse
-    # and a copy is stored in S3 (and DB as fallback). Subsequent requests serve from
-    # S3/DB instead of Langfuse, which is significantly faster. Use resync_score=true
-    # to bypass the cache and re-fetch from Langfuse (e.g., when new evaluators have run).
+    # and a copy is stored in S3. Subsequent requests serve from
+    # S3 instead of Langfuse, which is significantly faster. Use resync_score=true
+    # to bypass the cache and re-fetch from Langfuse and store to S3 again.
     has_cached_traces_s3 = eval_run.score_trace_url is not None
     has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score
     if not resync_score: