SreyanG-NVIDIA committed
Commit 473eee7 · verified · 1 Parent(s): 2427802

Add think-mode PEFT example

Files changed (1): README.md (+53 −0)
README.md CHANGED
@@ -244,6 +244,59 @@ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
  print(decoded_outputs)
  ```
 
+ ### Think-mode reasoning with PEFT adapter (AF-Think)
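+
+ The `think` subfolder of the checkpoint ships the AF-Think LoRA adapter together with a set of non-LoRA weights (`non_lora_trainables.bin`). The snippet below loads both on top of the base model, then prompts it to reason about the audio before answering.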
+
+ ```python
+ import os
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from peft import PeftModel
+
+ from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+ # Download the full checkpoint, including the "think" adapter subfolder.
+ model_id = "nvidia/audio-flamingo-3-hf"
+ local_id = snapshot_download(model_id)
+
+ processor = AutoProcessor.from_pretrained(local_id)
+ model = AudioFlamingo3ForConditionalGeneration.from_pretrained(local_id, device_map="auto")
+
+ # Load the extra non-LoRA weights trained alongside the adapter; strict=False
+ # because this file covers only a subset of the model's parameters.
+ non_lora_path = os.path.join(local_id, "think", "non_lora_trainables.bin")
+ non_lora_trainables = torch.load(non_lora_path)
+ model.load_state_dict(non_lora_trainables, strict=False)
+
+ # Attach the AF-Think LoRA adapter stored in the "think" subfolder.
+ model = PeftModel.from_pretrained(model, local_id, subfolder="think")
+
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely.\nPlease think and reason about the input music before you respond.",
+             },
+             {
+                 "type": "audio",
+                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/videoplayback_superman.wav",
+             },
+         ],
+     }
+ ]
+
+ inputs = processor.apply_chat_template(
+     conversation,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_dict=True,
+ ).to(model.device)
+
+ outputs = model.generate(**inputs, max_new_tokens=1024)
+
+ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+ print(decoded_outputs)
+ ```
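+
+ The trailing sentence of the text prompt ("Please think and reason about the input music before you respond.") is the think-mode cue: it asks the model to produce its reasoning before the final caption.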
+
  ### Training / Fine-tuning
 
  ```python