Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions backend/app/crud/evaluations/batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ def build_evaluation_jsonl(
body: dict[str, Any] = {
"model": config.model,
"instructions": config.instructions,
"temperature": config.temperature
if config.temperature is not None
else 0.01,
"input": question, # Add input from dataset
}

if "temperature" in config.model_fields_set:
body["temperature"] = config.temperature

# Add reasoning only if provided
if config.reasoning:
body["reasoning"] = {"effort": config.reasoning}
Expand Down Expand Up @@ -189,7 +189,7 @@ def start_evaluation_batch(
"description": f"Evaluation: {eval_run.run_name}",
"completion_window": "24h",
# Store complete config for reference
"evaluation_config": config.model_dump(exclude_none=True),
"evaluation_config": config.model_dump(exclude_unset=True),
}

# Step 5: Start batch job using generic infrastructure
Expand Down
7 changes: 7 additions & 0 deletions backend/app/models/llm/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,13 @@
"o1",
"o1-preview",
"o1-mini",
"gpt-5.4-pro",
"gpt-5.4-mini",
"gpt-5.4-nano",
"gpt-5",
"gpt-4-turbo",
"gpt-4",
"gpt-3.5-turbo",
],
}

Expand Down
3 changes: 3 additions & 0 deletions backend/app/models/llm/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def validate_params(self):
provider = self.provider
provider_was_auto_assigned = True

user_provided_temperature = "temperature" in self.params
validated = model_class.model_validate(self.params)

if provider is not None:
Expand Down Expand Up @@ -288,6 +289,8 @@ def validate_params(self):
)

self.params = validated.model_dump(exclude_none=True)
if not user_provided_temperature:
self.params.pop("temperature", None)
return self


Expand Down
59 changes: 59 additions & 0 deletions backend/app/tests/api/routes/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,65 @@ def test_build_batch_jsonl_multiple_items(self) -> None:
assert request_dict["body"]["input"] == f"Question {i}"
assert request_dict["body"]["model"] == "gpt-4o"

def test_build_batch_jsonl_temperature_included_when_explicitly_set(self) -> None:
    """When temperature is explicitly set, it should appear in the JSONL body."""
    items = [
        {
            "id": "item1",
            "input": {"question": "Test question"},
            "expected_output": {"answer": "Test answer"},
            "metadata": {},
        }
    ]

    # Explicit temperature puts the field into model_fields_set.
    params = TextLLMParams(model="gpt-4o", temperature=0.5)

    rows = build_evaluation_jsonl(items, params)

    assert len(rows) == 1
    body = rows[0]["body"]
    assert "temperature" in body
    assert body["temperature"] == 0.5

def test_build_batch_jsonl_temperature_excluded_when_not_set(self) -> None:
    """When temperature is not explicitly set, it should NOT appear in the JSONL body."""
    items = [
        {
            "id": "item1",
            "input": {"question": "Test question"},
            "expected_output": {"answer": "Test answer"},
            "metadata": {},
        }
    ]

    # Model only: temperature stays out of model_fields_set, so the
    # builder must not emit it even though the field has a default.
    params = TextLLMParams(model="gpt-4o")

    rows = build_evaluation_jsonl(items, params)

    assert len(rows) == 1
    assert "temperature" not in rows[0]["body"]

def test_build_batch_jsonl_temperature_zero_included_when_explicitly_set(
    self,
) -> None:
    """When temperature is explicitly set to 0.0, it should still appear in the body."""
    items = [
        {
            "id": "item1",
            "input": {"question": "Test question"},
            "expected_output": {"answer": "Test answer"},
            "metadata": {},
        }
    ]

    # 0.0 is falsy — guards against a truthiness check dropping it.
    params = TextLLMParams(model="gpt-4o", temperature=0.0)

    rows = build_evaluation_jsonl(items, params)

    assert len(rows) == 1
    body = rows[0]["body"]
    assert "temperature" in body
    assert body["temperature"] == 0.0


class TestGetEvaluationRunStatus:
"""Test GET /evaluations/{evaluation_id} endpoint."""
Expand Down
Empty file.
Empty file.
111 changes: 111 additions & 0 deletions backend/app/tests/models/llm/test_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import pytest
from pydantic import ValidationError

from app.models.llm.request import KaapiCompletionConfig


class TestKaapiCompletionConfigTemperature:
    """Test temperature handling in KaapiCompletionConfig.validate_params."""

    @staticmethod
    def _make_config(params: dict) -> KaapiCompletionConfig:
        # Shared constructor for an openai/text config with arbitrary params.
        return KaapiCompletionConfig(
            provider="openai",
            type="text",
            params=params,
        )

    def test_temperature_preserved_when_user_provides_it(self) -> None:
        """When user explicitly provides temperature, it should be in params."""
        config = self._make_config({"model": "gpt-4o", "temperature": 0.7})

        assert "temperature" in config.params
        assert config.params["temperature"] == 0.7

    def test_temperature_excluded_when_user_does_not_provide_it(self) -> None:
        """When user does not provide temperature, it should NOT be in params
        even though TextLLMParams has a default of 0.1."""
        config = self._make_config({"model": "gpt-4o"})

        assert "temperature" not in config.params

    def test_temperature_zero_preserved_when_explicitly_set(self) -> None:
        """When user explicitly sets temperature to 0.0, it should be preserved."""
        config = self._make_config({"model": "gpt-4o", "temperature": 0.0})

        # 0.0 is falsy, so this also guards against truthiness-based filtering.
        assert "temperature" in config.params
        assert config.params["temperature"] == 0.0


class TestNewSupportedModels:
    """Test that newly added models are accepted for openai/text provider."""

    @staticmethod
    def _validate(model_name: str) -> KaapiCompletionConfig:
        # Build a minimal openai/text config; validation runs on construction.
        return KaapiCompletionConfig(
            provider="openai",
            type="text",
            params={"model": model_name},
        )

    @pytest.mark.parametrize(
        "model",
        [
            "gpt-5.4-pro",
            "gpt-5.4-mini",
            "gpt-5.4-nano",
            "gpt-5",
            "gpt-4-turbo",
            "gpt-4",
            "gpt-3.5-turbo",
        ],
    )
    def test_new_model_accepted(self, model: str) -> None:
        """New models should be accepted for openai text provider."""
        config = self._validate(model)

        assert config.params["model"] == model

    @pytest.mark.parametrize(
        "model",
        [
            "gpt-4o",
            "gpt-4o-mini",
            "gpt-4.1",
            "gpt-4.1-mini",
            "gpt-4.1-nano",
            "gpt-5.4",
            "gpt-5.1",
            "gpt-5-mini",
            "gpt-5-nano",
            "o1",
            "o1-preview",
            "o1-mini",
        ],
    )
    def test_existing_models_still_accepted(self, model: str) -> None:
        """Previously supported models should still be accepted."""
        config = self._validate(model)

        assert config.params["model"] == model

    def test_unsupported_model_rejected(self) -> None:
        """An unsupported model should raise a validation error."""
        with pytest.raises(ValidationError, match="not supported"):
            self._validate("unsupported-model-xyz")
Loading