diff --git a/backend/app/crud/evaluations/langfuse.py b/backend/app/crud/evaluations/langfuse.py index 1dd0c519..139aeca4 100644 --- a/backend/app/crud/evaluations/langfuse.py +++ b/backend/app/crud/evaluations/langfuse.py @@ -492,28 +492,9 @@ def fetch_trace_scores_from_langfuse( ) continue - # 4. Identify complete scores (all traces must have the score) - total_traces = len(traces) - complete_score_names = { - name - for name, data in score_aggregations.items() - if len(data["values"]) == total_traces - } - - # 5. Filter trace scores to only include complete scores - for trace in traces: - trace["scores"] = [ - score - for score in trace["scores"] - if score["name"] in complete_score_names - ] - - # 6. Calculate summary scores (only for complete scores) + # 4. Calculate summary scores for all scores that have at least one value summary_scores = [] for score_name, agg_data in score_aggregations.items(): - if score_name not in complete_score_names: - continue - data_type = agg_data["data_type"] values = agg_data["values"] @@ -552,7 +533,7 @@ def fetch_trace_scores_from_langfuse( logger.info( f"[fetch_trace_scores_from_langfuse] Successfully fetched scores | " - f"total_traces={len(traces)} | complete_scores={list(complete_score_names)}" + f"total_traces={len(traces)} | score_names={list(score_aggregations.keys())}" ) return result diff --git a/backend/app/services/evaluations/evaluation.py b/backend/app/services/evaluations/evaluation.py index 594122cb..55653b25 100644 --- a/backend/app/services/evaluations/evaluation.py +++ b/backend/app/services/evaluations/evaluation.py @@ -239,7 +239,10 @@ def get_evaluation_with_scores( if not get_trace_info: return eval_run, None - # Check if we already have cached traces + # Caching strategy: On the first request, trace scores are fetched from Langfuse + # and a copy is stored in S3. Subsequent requests serve from + # S3 instead of Langfuse, which is significantly faster. 
Use resync_score=true + # to bypass the cache, re-fetch from Langfuse, and refresh the stored copy. has_cached_traces_s3 = eval_run.score_trace_url is not None has_cached_traces_db = eval_run.score is not None and "traces" in eval_run.score if not resync_score: @@ -276,6 +279,7 @@ def get_evaluation_with_scores( ) return eval_run, None + # Cache miss or resync requested — fetch fresh scores from Langfuse langfuse = get_langfuse_client( session=session, org_id=organization_id, @@ -321,7 +325,7 @@ def get_evaluation_with_scores( merged_summary_scores = list(existing_scores_map.values()) - # Build final score with merged summary_scores and traces + # Build final score and persist to S3/DB for future cached reads score = { "summary_scores": merged_summary_scores, "traces": langfuse_score.get("traces", []),