Enable `cache_params` to work with `generate()` from `GenerationMixin`
#3 · opened by FremyCompany
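This patch renames the `cache_params` argument and output field of `NemotronHForCausalLM.forward()` to `past_key_values`, the name `GenerationMixin` uses when it threads a cache through decoding, so the `HybridMambaAttentionDynamicCache` can round-trip through `generate()`. A minimal usage sketch, assuming a Nemotron-H checkpoint that ships this modeling file (the checkpoint id and prompt below are illustrative, not part of the patch):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative checkpoint id; any Nemotron-H causal LM using this modeling file should work the same way.
model_id = "nvidia/Nemotron-H-8B-Base-8K"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.bfloat16)

inputs = tokenizer("The capital of France is", return_tensors="pt")

# With the rename in place, generate() can hand the hybrid Mamba/attention cache
# back into forward() as `past_key_values` on every decoding step.
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```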
modeling_nemotron_h.py (CHANGED: +20 -15)
@@ -31,6 +31,9 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
 )
 from transformers.modeling_utils import PreTrainedModel
+from transformers.modeling_outputs import (
+    MoeCausalLMOutputWithPast,
+)
 from transformers.utils import (
     ModelOutput,
     add_code_sample_docstrings,

@@ -168,12 +171,14 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
 
     def __init__(self, config, batch_size, dtype=torch.float16, device=None):
         super().__init__()
+        self.device = device
         self.dtype = dtype
         self.hybrid_override_pattern = config.hybrid_override_pattern
         self.has_previous_state = False  # only used by mamba
-        intermediate_size = config.expand * config.hidden_size
-        ssm_state_size = config.ssm_state_size
-        conv_kernel_size = config.conv_kernel
+        self.intermediate_size = config.expand * config.hidden_size
+        self.ssm_state_size = config.ssm_state_size
+        self.conv_kernel_size = config.conv_kernel
+        self.conv_dim = self.intermediate_size + 2 * config.n_groups * config.ssm_state_size
         self.conv_states = []
         self.ssm_states = []
         self.transformer_layers = []

@@ -181,10 +186,10 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
             if self.hybrid_override_pattern[i] == "M":
                 # Mamba layer
                 self.conv_states += [
-                    torch.zeros(batch_size, intermediate_size, conv_kernel_size, device=device, dtype=dtype)
+                    torch.zeros(batch_size, self.conv_dim, self.conv_kernel_size, device=device, dtype=dtype)
                 ]
                 self.ssm_states += [
-                    torch.zeros(batch_size, intermediate_size, ssm_state_size, device=device, dtype=dtype)
+                    torch.zeros(batch_size, self.intermediate_size, self.ssm_state_size, device=device, dtype=dtype)
                 ]
             else:
                 # Attention or MLP layer

@@ -245,14 +250,14 @@ class HybridMambaAttentionDynamicCache(DynamicCache):
         self, layer_idx: int, new_conv_state: torch.Tensor, cache_init: bool = False
     ) -> torch.Tensor:
         if cache_init:
-            self.conv_states[layer_idx] = new_conv_state.to(self.conv_states.device)
+            self.conv_states[layer_idx] = new_conv_state.to(self.device)
         else:
             self.conv_states[layer_idx] = self.conv_states[layer_idx].roll(shifts=-1, dims=-1)
-            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states.device)
+            self.conv_states[layer_idx][:, :, -1] = new_conv_state[:, 0, :].to(self.conv_states[layer_idx].device)
         return self.conv_states[layer_idx]
 
     def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor):
-        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states.device)
+        self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device)
         return self.ssm_states[layer_idx]
 
     def reset(self):

@@ -413,7 +418,7 @@ class NemotronHMamba2Mixer(nn.Module):
                 dt_softplus=True,
             )
             hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
-            breakpoint()
+            # TODO: why was there a breakpoint() call here?
             hidden_states = self.norm(hidden_states, gate)
 
             # 4. Final linear projection

@@ -560,7 +565,7 @@ class NemotronHMamba2Mixer(nn.Module):
         A = -torch.exp(self.A_log.float())  # [num_heads]
         if cache_params is not None and cache_position is not None and cache_position[0] > 0:
             # We need to guarantee that anything regarding the cache is on the same device
-            cache_device = cache_params.ssm_states.device
+            cache_device = cache_params.ssm_states[0].device if len(cache_params.ssm_states) > 0 else cache_params.device
 
             # Note: there is no need to pad parameter matrices here, as there is just one new token
             # for batched generation

@@ -1185,7 +1190,7 @@ class NemotronHOutput(ModelOutput):
 
 @dataclass
 # Copied from transformers.models.mamba2.modeling_mamba2.MambaCausalLMOutput with Mamba2->NemotronH
-class NemotronHCausalLMOutput(ModelOutput):
+class NemotronHCausalLMOutput(MoeCausalLMOutputWithPast):
     """
     Base class for causal language model (or autoregressive) outputs.
 

@@ -1208,7 +1213,7 @@ class NemotronHCausalLMOutput(ModelOutput):
 
     loss: Optional[torch.FloatTensor] = None
     logits: Optional[torch.FloatTensor] = None
-    cache_params: Optional[HybridMambaAttentionDynamicCache] = None
+    past_key_values: Optional[HybridMambaAttentionDynamicCache] = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 

@@ -1568,7 +1573,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
         input_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        cache_params: Optional[HybridMambaAttentionDynamicCache] = None,
+        past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
         labels: Optional[torch.LongTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,

@@ -1593,7 +1598,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
 
         nemotron_h_outputs = self.backbone(
             input_ids,
-            cache_params=cache_params,
+            cache_params=past_key_values,
             inputs_embeds=inputs_embeds,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,

@@ -1626,7 +1631,7 @@ class NemotronHForCausalLM(NemotronHPreTrainedModel, GenerationMixin):
         return NemotronHCausalLMOutput(
             loss=loss,
             logits=logits,
-            cache_params=nemotron_h_outputs.cache_params,
+            past_key_values=nemotron_h_outputs.cache_params,
             hidden_states=nemotron_h_outputs.hidden_states,
             attentions=nemotron_h_outputs.attentions,
         )
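
A note on the device handling in `update_conv_state` and `update_ssm_state`: `conv_states` and `ssm_states` are plain Python lists holding one tensor per Mamba layer, so the previous `self.conv_states.device` / `self.ssm_states.device` lookups could not work (a list has no `.device`); the device has to be read from an individual layer's tensor, or from the new `self.device` attribute. A small sketch with made-up sizes, only to illustrate the list-of-tensors layout:

```python
import torch

# Made-up sizes; the real values come from the model config.
batch_size, conv_dim, conv_kernel_size = 1, 256, 4
conv_states = [torch.zeros(batch_size, conv_dim, conv_kernel_size) for _ in range(3)]

# conv_states.device would raise AttributeError: a list has no .device
layer_device = conv_states[0].device  # the per-layer tensor does

# One decoding step, mirroring the patched update path:
new_conv_state = torch.randn(batch_size, 1, conv_dim)     # a single new token
conv_states[0] = conv_states[0].roll(shifts=-1, dims=-1)  # slide the conv window
conv_states[0][:, :, -1] = new_conv_state[:, 0, :].to(conv_states[0].device)
```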