Spaces:
Sleeping
Sleeping
fix: tighten reward/grade bounds to [0.01, 0.99] in models, orchestrator, and tests
Browse files
- rag_master/models.py +1 -1
- rag_master/orchestrator.py +3 -3
- server/models.py +4 -4
- tests/test_integration.py +1 -1
- tests/test_orchestrator.py +2 -2
rag_master/models.py
CHANGED
|
@@ -78,7 +78,7 @@ class Trajectory(BaseModel):
|
|
| 78 |
task_id: str
|
| 79 |
steps: List[StepRecord] = Field(default_factory=list)
|
| 80 |
total_reward: float = 0.0
|
| 81 |
-
final_score: float = 0.0
|
| 82 |
completed: bool = False
|
| 83 |
metadata: Dict[str, Any] = Field(default_factory=dict)
|
| 84 |
|
|
|
|
| 78 |
task_id: str
|
| 79 |
steps: List[StepRecord] = Field(default_factory=list)
|
| 80 |
total_reward: float = 0.0
|
| 81 |
+
final_score: float = Field(default=0.01, ge=0.01, le=0.99)
|
| 82 |
completed: bool = False
|
| 83 |
metadata: Dict[str, Any] = Field(default_factory=dict)
|
| 84 |
|
rag_master/orchestrator.py
CHANGED
|
@@ -137,7 +137,7 @@ class Orchestrator:
|
|
| 137 |
if self._state.done:
|
| 138 |
return {
|
| 139 |
"observation": self._build_observation(),
|
| 140 |
-
"reward": 0.0,
|
| 141 |
"done": True,
|
| 142 |
"info": {"message": "Episode already completed."},
|
| 143 |
}
|
|
@@ -280,10 +280,10 @@ class Orchestrator:
|
|
| 280 |
],
|
| 281 |
"query_history": self._state.query_history[-5:],
|
| 282 |
"current_answer": self._state.generated_answer[:1000],
|
| 283 |
-
"last_reward": (
|
| 284 |
self._state.intermediate_rewards[-1]
|
| 285 |
if self._state.intermediate_rewards
|
| 286 |
-
else 0.0
|
| 287 |
),
|
| 288 |
"done": self._state.done,
|
| 289 |
}
|
|
|
|
| 137 |
if self._state.done:
|
| 138 |
return {
|
| 139 |
"observation": self._build_observation(),
|
| 140 |
+
"reward": clamp_score(0.01),
|
| 141 |
"done": True,
|
| 142 |
"info": {"message": "Episode already completed."},
|
| 143 |
}
|
|
|
|
| 280 |
],
|
| 281 |
"query_history": self._state.query_history[-5:],
|
| 282 |
"current_answer": self._state.generated_answer[:1000],
|
| 283 |
+
"last_reward": clamp_score(
|
| 284 |
self._state.intermediate_rewards[-1]
|
| 285 |
if self._state.intermediate_rewards
|
| 286 |
+
else 0.01
|
| 287 |
),
|
| 288 |
"done": self._state.done,
|
| 289 |
}
|
server/models.py
CHANGED
|
@@ -60,14 +60,14 @@ class Observation(BaseModel):
|
|
| 60 |
default_factory=list, description="Recent query history."
|
| 61 |
)
|
| 62 |
current_answer: str = Field(default="", description="Current answer draft.")
|
| 63 |
-
last_reward: float = Field(default=0.0, description="Reward from last step.")
|
| 64 |
done: bool = Field(default=False, description="Whether the episode has ended.")
|
| 65 |
|
| 66 |
|
| 67 |
class Reward(BaseModel):
|
| 68 |
"""Reward signal from the environment."""
|
| 69 |
|
| 70 |
-
value: float = Field(ge=0.
|
| 71 |
breakdown: Dict[str, float] = Field(
|
| 72 |
default_factory=dict, description="Reward component breakdown."
|
| 73 |
)
|
|
@@ -77,7 +77,7 @@ class StepResult(BaseModel):
|
|
| 77 |
"""Result of a single environment step."""
|
| 78 |
|
| 79 |
observation: Observation
|
| 80 |
-
reward: float = Field(ge=0.0, le=1.0)
|
| 81 |
done: bool = Field(default=False)
|
| 82 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 83 |
|
|
@@ -121,7 +121,7 @@ class GradeResult(BaseModel):
|
|
| 121 |
"""Grading result for a task."""
|
| 122 |
|
| 123 |
task_id: str
|
| 124 |
-
score: float = Field(ge=0.0, le=1.0)
|
| 125 |
episode_id: str = Field(default="")
|
| 126 |
|
| 127 |
|
|
|
|
| 60 |
default_factory=list, description="Recent query history."
|
| 61 |
)
|
| 62 |
current_answer: str = Field(default="", description="Current answer draft.")
|
| 63 |
+
last_reward: float = Field(default=0.01, ge=0.01, le=0.99, description="Reward from last step (strictly within [0.01, 0.99]).")
|
| 64 |
done: bool = Field(default=False, description="Whether the episode has ended.")
|
| 65 |
|
| 66 |
|
| 67 |
class Reward(BaseModel):
|
| 68 |
"""Reward signal from the environment."""
|
| 69 |
|
| 70 |
+
value: float = Field(ge=0.01, le=0.99, description="Reward value (strictly within [0.01, 0.99]).")
|
| 71 |
breakdown: Dict[str, float] = Field(
|
| 72 |
default_factory=dict, description="Reward component breakdown."
|
| 73 |
)
|
|
|
|
| 77 |
"""Result of a single environment step."""
|
| 78 |
|
| 79 |
observation: Observation
|
| 80 |
+
reward: float = Field(ge=0.01, le=0.99)
|
| 81 |
done: bool = Field(default=False)
|
| 82 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 83 |
|
|
|
|
| 121 |
"""Grading result for a task."""
|
| 122 |
|
| 123 |
task_id: str
|
| 124 |
+
score: float = Field(ge=0.01, le=0.99)
|
| 125 |
episode_id: str = Field(default="")
|
| 126 |
|
| 127 |
|
tests/test_integration.py
CHANGED
|
@@ -158,7 +158,7 @@ class TestFullEpisodeIntegration:
|
|
| 158 |
resp = await client.post("/grade", json={})
|
| 159 |
assert resp.status_code == 200
|
| 160 |
grade = resp.json()
|
| 161 |
-
assert 0.0 <= grade["score"] <= 1.0
|
| 162 |
|
| 163 |
@pytest.mark.asyncio
|
| 164 |
async def test_reset_clears_previous_episode(self, client: AsyncClient) -> None:
|
|
|
|
| 158 |
resp = await client.post("/grade", json={})
|
| 159 |
assert resp.status_code == 200
|
| 160 |
grade = resp.json()
|
| 161 |
+
assert 0.01 <= grade["score"] <= 0.99
|
| 162 |
|
| 163 |
@pytest.mark.asyncio
|
| 164 |
async def test_reset_clears_previous_episode(self, client: AsyncClient) -> None:
|
tests/test_orchestrator.py
CHANGED
|
@@ -152,7 +152,7 @@ class TestOrchestrator:
|
|
| 152 |
assert "observation" in result
|
| 153 |
assert "reward" in result
|
| 154 |
assert "done" in result
|
| 155 |
-
assert 0.0 <= result["reward"] <= 1.0
|
| 156 |
|
| 157 |
@pytest.mark.asyncio
|
| 158 |
async def test_step_reason(self, orchestrator: Orchestrator) -> None:
|
|
@@ -197,7 +197,7 @@ class TestOrchestrator:
|
|
| 197 |
# Now episode is done, further steps should return done=True with 0 reward
|
| 198 |
result = await orchestrator.step({"type": "retrieve"})
|
| 199 |
assert result["done"] is True
|
| 200 |
-
assert result["reward"] == 0.0
|
| 201 |
|
| 202 |
@pytest.mark.asyncio
|
| 203 |
async def test_state_before_reset(self, orchestrator: Orchestrator) -> None:
|
|
|
|
| 152 |
assert "observation" in result
|
| 153 |
assert "reward" in result
|
| 154 |
assert "done" in result
|
| 155 |
+
assert 0.01 <= result["reward"] <= 0.99
|
| 156 |
|
| 157 |
@pytest.mark.asyncio
|
| 158 |
async def test_step_reason(self, orchestrator: Orchestrator) -> None:
|
|
|
|
| 197 |
# Now episode is done, further steps should return done=True with the minimum reward (0.01)
|
| 198 |
result = await orchestrator.step({"type": "retrieve"})
|
| 199 |
assert result["done"] is True
|
| 200 |
+
assert result["reward"] == 0.01
|
| 201 |
|
| 202 |
@pytest.mark.asyncio
|
| 203 |
async def test_state_before_reset(self, orchestrator: Orchestrator) -> None:
|