williyam committed on
Commit
9955cc3
·
1 Parent(s): a8780b7

fix: tighten reward/grade bounds to [0.01, 0.99] in models, orchestrator, and tests

Browse files
rag_master/models.py CHANGED
@@ -78,7 +78,7 @@ class Trajectory(BaseModel):
78
  task_id: str
79
  steps: List[StepRecord] = Field(default_factory=list)
80
  total_reward: float = 0.0
81
- final_score: float = 0.0
82
  completed: bool = False
83
  metadata: Dict[str, Any] = Field(default_factory=dict)
84
 
 
78
  task_id: str
79
  steps: List[StepRecord] = Field(default_factory=list)
80
  total_reward: float = 0.0
81
+ final_score: float = Field(default=0.01, ge=0.01, le=0.99)
82
  completed: bool = False
83
  metadata: Dict[str, Any] = Field(default_factory=dict)
84
 
rag_master/orchestrator.py CHANGED
@@ -137,7 +137,7 @@ class Orchestrator:
137
  if self._state.done:
138
  return {
139
  "observation": self._build_observation(),
140
- "reward": 0.0,
141
  "done": True,
142
  "info": {"message": "Episode already completed."},
143
  }
@@ -280,10 +280,10 @@ class Orchestrator:
280
  ],
281
  "query_history": self._state.query_history[-5:],
282
  "current_answer": self._state.generated_answer[:1000],
283
- "last_reward": (
284
  self._state.intermediate_rewards[-1]
285
  if self._state.intermediate_rewards
286
- else 0.0
287
  ),
288
  "done": self._state.done,
289
  }
 
137
  if self._state.done:
138
  return {
139
  "observation": self._build_observation(),
140
+ "reward": clamp_score(0.01),
141
  "done": True,
142
  "info": {"message": "Episode already completed."},
143
  }
 
280
  ],
281
  "query_history": self._state.query_history[-5:],
282
  "current_answer": self._state.generated_answer[:1000],
283
+ "last_reward": clamp_score(
284
  self._state.intermediate_rewards[-1]
285
  if self._state.intermediate_rewards
286
+ else 0.01
287
  ),
288
  "done": self._state.done,
289
  }
server/models.py CHANGED
@@ -60,14 +60,14 @@ class Observation(BaseModel):
60
  default_factory=list, description="Recent query history."
61
  )
62
  current_answer: str = Field(default="", description="Current answer draft.")
63
- last_reward: float = Field(default=0.0, description="Reward from last step.")
64
  done: bool = Field(default=False, description="Whether the episode has ended.")
65
 
66
 
67
  class Reward(BaseModel):
68
  """Reward signal from the environment."""
69
 
70
- value: float = Field(ge=0.0, le=1.0, description="Reward value.")
71
  breakdown: Dict[str, float] = Field(
72
  default_factory=dict, description="Reward component breakdown."
73
  )
@@ -77,7 +77,7 @@ class StepResult(BaseModel):
77
  """Result of a single environment step."""
78
 
79
  observation: Observation
80
- reward: float = Field(ge=0.0, le=1.0)
81
  done: bool = Field(default=False)
82
  info: Dict[str, Any] = Field(default_factory=dict)
83
 
@@ -121,7 +121,7 @@ class GradeResult(BaseModel):
121
  """Grading result for a task."""
122
 
123
  task_id: str
124
- score: float = Field(ge=0.0, le=1.0)
125
  episode_id: str = Field(default="")
126
 
127
 
 
60
  default_factory=list, description="Recent query history."
61
  )
62
  current_answer: str = Field(default="", description="Current answer draft.")
63
+ last_reward: float = Field(default=0.01, ge=0.01, le=0.99, description="Reward from last step (strictly within [0.01, 0.99]).")
64
  done: bool = Field(default=False, description="Whether the episode has ended.")
65
 
66
 
67
  class Reward(BaseModel):
68
  """Reward signal from the environment."""
69
 
70
+ value: float = Field(ge=0.01, le=0.99, description="Reward value (strictly within [0.01, 0.99]).")
71
  breakdown: Dict[str, float] = Field(
72
  default_factory=dict, description="Reward component breakdown."
73
  )
 
77
  """Result of a single environment step."""
78
 
79
  observation: Observation
80
+ reward: float = Field(ge=0.01, le=0.99)
81
  done: bool = Field(default=False)
82
  info: Dict[str, Any] = Field(default_factory=dict)
83
 
 
121
  """Grading result for a task."""
122
 
123
  task_id: str
124
+ score: float = Field(ge=0.01, le=0.99)
125
  episode_id: str = Field(default="")
126
 
127
 
tests/test_integration.py CHANGED
@@ -158,7 +158,7 @@ class TestFullEpisodeIntegration:
158
  resp = await client.post("/grade", json={})
159
  assert resp.status_code == 200
160
  grade = resp.json()
161
- assert 0.0 <= grade["score"] <= 1.0
162
 
163
  @pytest.mark.asyncio
164
  async def test_reset_clears_previous_episode(self, client: AsyncClient) -> None:
 
158
  resp = await client.post("/grade", json={})
159
  assert resp.status_code == 200
160
  grade = resp.json()
161
+ assert 0.01 <= grade["score"] <= 0.99
162
 
163
  @pytest.mark.asyncio
164
  async def test_reset_clears_previous_episode(self, client: AsyncClient) -> None:
tests/test_orchestrator.py CHANGED
@@ -152,7 +152,7 @@ class TestOrchestrator:
152
  assert "observation" in result
153
  assert "reward" in result
154
  assert "done" in result
155
- assert 0.0 <= result["reward"] <= 1.0
156
 
157
  @pytest.mark.asyncio
158
  async def test_step_reason(self, orchestrator: Orchestrator) -> None:
@@ -197,7 +197,7 @@ class TestOrchestrator:
197
  # Now episode is done, further steps should return done=True with 0 reward
198
  result = await orchestrator.step({"type": "retrieve"})
199
  assert result["done"] is True
200
- assert result["reward"] == 0.0
201
 
202
  @pytest.mark.asyncio
203
  async def test_state_before_reset(self, orchestrator: Orchestrator) -> None:
 
152
  assert "observation" in result
153
  assert "reward" in result
154
  assert "done" in result
155
+ assert 0.01 <= result["reward"] <= 0.99
156
 
157
  @pytest.mark.asyncio
158
  async def test_step_reason(self, orchestrator: Orchestrator) -> None:
 
197
  # Now episode is done, further steps should return done=True with 0 reward
198
  result = await orchestrator.step({"type": "retrieve"})
199
  assert result["done"] is True
200
+ assert result["reward"] == 0.01
201
 
202
  @pytest.mark.asyncio
203
  async def test_state_before_reset(self, orchestrator: Orchestrator) -> None: