diff --git a/backend/app/crud/evaluations/batch.py b/backend/app/crud/evaluations/batch.py
index 1ef6f9052..13fb9a50b 100644
--- a/backend/app/crud/evaluations/batch.py
+++ b/backend/app/crud/evaluations/batch.py
@@ -106,12 +106,12 @@ def build_evaluation_jsonl(
         body: dict[str, Any] = {
             "model": config.model,
             "instructions": config.instructions,
-            "temperature": config.temperature
-            if config.temperature is not None
-            else 0.01,
             "input": question,  # Add input from dataset
         }
 
+        if "temperature" in config.model_fields_set:
+            body["temperature"] = config.temperature
+
         # Add reasoning only if provided
         if config.reasoning:
             body["reasoning"] = {"effort": config.reasoning}
@@ -189,7 +189,7 @@ def start_evaluation_batch(
         "description": f"Evaluation: {eval_run.run_name}",
         "completion_window": "24h",
         # Store complete config for reference
-        "evaluation_config": config.model_dump(exclude_none=True),
+        "evaluation_config": config.model_dump(exclude_unset=True),
     }
 
     # Step 5: Start batch job using generic infrastructure
diff --git a/backend/app/models/llm/constants.py b/backend/app/models/llm/constants.py
index 8cb8f71b3..a604d2a50 100644
--- a/backend/app/models/llm/constants.py
+++ b/backend/app/models/llm/constants.py
@@ -22,6 +22,13 @@
         "o1",
         "o1-preview",
         "o1-mini",
+        "gpt-5.4-pro",
+        "gpt-5.4-mini",
+        "gpt-5.4-nano",
+        "gpt-5",
+        "gpt-4-turbo",
+        "gpt-4",
+        "gpt-3.5-turbo",
     ],
 }
 
diff --git a/backend/app/models/llm/request.py b/backend/app/models/llm/request.py
index 1317c9ef3..8cc5f5c3e 100644
--- a/backend/app/models/llm/request.py
+++ b/backend/app/models/llm/request.py
@@ -251,6 +251,7 @@ def validate_params(self):
             provider = self.provider
             provider_was_auto_assigned = True
 
+        user_provided_temperature = "temperature" in self.params
         validated = model_class.model_validate(self.params)
 
         if provider is not None:
@@ -288,6 +289,8 @@
             )
 
         self.params = validated.model_dump(exclude_none=True)
+        if not user_provided_temperature:
+            self.params.pop("temperature", None)
 
         return self
 
diff --git a/backend/app/tests/api/routes/test_evaluation.py b/backend/app/tests/api/routes/test_evaluation.py
index 4b751a59a..17d647aea 100644
--- a/backend/app/tests/api/routes/test_evaluation.py
+++ b/backend/app/tests/api/routes/test_evaluation.py
@@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
             assert request_dict["body"]["input"] == f"Question {i}"
             assert request_dict["body"]["model"] == "gpt-4o"
 
+    def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
+        """When temperature is explicitly set, it should appear in the JSONL body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        config = TextLLMParams(model="gpt-4o", temperature=0.5)
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" in jsonl_data[0]["body"]
+        assert jsonl_data[0]["body"]["temperature"] == 0.5
+
+    def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
+        """When temperature is not explicitly set, it should NOT appear in the JSONL body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        # Only model provided — temperature not in model_fields_set
+        config = TextLLMParams(model="gpt-4o")
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" not in jsonl_data[0]["body"]
+
+    def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
+        self,
+    ) -> None:
+        """When temperature is explicitly set to 0.0, it should still appear in the body."""
+        dataset_items = [
+            {
+                "id": "item1",
+                "input": {"question": "Test question"},
+                "expected_output": {"answer": "Test answer"},
+                "metadata": {},
+            }
+        ]
+
+        config = TextLLMParams(model="gpt-4o", temperature=0.0)
+
+        jsonl_data = build_evaluation_jsonl(dataset_items, config)
+
+        assert len(jsonl_data) == 1
+        assert "temperature" in jsonl_data[0]["body"]
+        assert jsonl_data[0]["body"]["temperature"] == 0.0
+
 
 class TestGetEvaluationRunStatus:
     """Test GET /evaluations/{evaluation_id} endpoint."""
diff --git a/backend/app/tests/models/__init__.py b/backend/app/tests/models/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/app/tests/models/llm/__init__.py b/backend/app/tests/models/llm/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/backend/app/tests/models/llm/test_request.py b/backend/app/tests/models/llm/test_request.py
new file mode 100644
index 000000000..3d40f607a
--- /dev/null
+++ b/backend/app/tests/models/llm/test_request.py
@@ -0,0 +1,111 @@
+import pytest
+from pydantic import ValidationError
+
+from app.models.llm.request import KaapiCompletionConfig
+
+
+class TestKaapiCompletionConfigTemperature:
+    """Test temperature handling in KaapiCompletionConfig.validate_params."""
+
+    def test_temperature_preserved_when_user_provides_it(self) -> None:
+        """When user explicitly provides temperature, it should be in params."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+                "temperature": 0.7,
+            },
+        )
+
+        assert "temperature" in config.params
+        assert config.params["temperature"] == 0.7
+
+    def test_temperature_excluded_when_user_does_not_provide_it(self) -> None:
+        """When user does not provide temperature, it should NOT be in params
+        even though TextLLMParams has a default of 0.1."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+            },
+        )
+
+        assert "temperature" not in config.params
+
+    def test_temperature_zero_preserved_when_explicitly_set(self) -> None:
+        """When user explicitly sets temperature to 0.0, it should be preserved."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={
+                "model": "gpt-4o",
+                "temperature": 0.0,
+            },
+        )
+
+        assert "temperature" in config.params
+        assert config.params["temperature"] == 0.0
+
+
+class TestNewSupportedModels:
+    """Test that newly added models are accepted for openai/text provider."""
+
+    @pytest.mark.parametrize(
+        "model",
+        [
+            "gpt-5.4-pro",
+            "gpt-5.4-mini",
+            "gpt-5.4-nano",
+            "gpt-5",
+            "gpt-4-turbo",
+            "gpt-4",
+            "gpt-3.5-turbo",
+        ],
+    )
+    def test_new_model_accepted(self, model: str) -> None:
+        """New models should be accepted for openai text provider."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={"model": model},
+        )
+
+        assert config.params["model"] == model
+
+    @pytest.mark.parametrize(
+        "model",
+        [
+            "gpt-4o",
+            "gpt-4o-mini",
+            "gpt-4.1",
+            "gpt-4.1-mini",
+            "gpt-4.1-nano",
+            "gpt-5.4",
+            "gpt-5.1",
+            "gpt-5-mini",
+            "gpt-5-nano",
+            "o1",
+            "o1-preview",
+            "o1-mini",
+        ],
+    )
+    def test_existing_models_still_accepted(self, model: str) -> None:
+        """Previously supported models should still be accepted."""
+        config = KaapiCompletionConfig(
+            provider="openai",
+            type="text",
+            params={"model": model},
+        )
+
+        assert config.params["model"] == model
+
+    def test_unsupported_model_rejected(self) -> None:
+        """An unsupported model should raise a validation error."""
+        with pytest.raises(ValidationError, match="not supported"):
+            KaapiCompletionConfig(
+                provider="openai",
+                type="text",
+                params={"model": "unsupported-model-xyz"},
+            )