SreyanG-NVIDIA committed
Commit 473eee7 · verified · 1 Parent(s): 2427802

Add think-mode PEFT example

Files changed (1): README.md (+53 −0)
README.md CHANGED
@@ -244,6 +244,59 @@ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:],
  print(decoded_outputs)
  ```
 
+ ### Think-mode reasoning with PEFT adapter (AF-Think)
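+
+ The `think` subfolder of the checkpoint ships the AF-Think LoRA adapter together with a set of non-LoRA weights (`non_lora_trainables.bin`). The snippet below loads both on top of the base model, then prompts it to reason about the audio before answering.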
+
+ ```python
+ import os
+
+ import torch
+ from huggingface_hub import snapshot_download
+ from peft import PeftModel
+
+ from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+ # Download the full checkpoint, including the "think" adapter subfolder.
+ model_id = "nvidia/audio-flamingo-3-hf"
+ local_id = snapshot_download(model_id)
+
+ processor = AutoProcessor.from_pretrained(local_id)
+ model = AudioFlamingo3ForConditionalGeneration.from_pretrained(local_id, device_map="auto")
+
+ # Load the extra non-LoRA weights trained alongside the adapter; strict=False
+ # because this file covers only a subset of the model's parameters.
+ non_lora_path = os.path.join(local_id, "think", "non_lora_trainables.bin")
+ non_lora_trainables = torch.load(non_lora_path)
+ model.load_state_dict(non_lora_trainables, strict=False)
+
+ # Attach the AF-Think LoRA adapter stored in the "think" subfolder.
+ model = PeftModel.from_pretrained(model, local_id, subfolder="think")
+
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {
+                 "type": "text",
+                 "text": "Generate a detailed caption for the input audio, describing all notable speech, sound, and musical events comprehensively. In the caption, transcribe all spoken content by all speakers in the audio precisely.\nPlease think and reason about the input music before you respond.",
+             },
+             {
+                 "type": "audio",
+                 "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/videoplayback_superman.wav",
+             },
+         ],
+     }
+ ]
+
+ inputs = processor.apply_chat_template(
+     conversation,
+     tokenize=True,
+     add_generation_prompt=True,
+     return_dict=True,
+ ).to(model.device)
+
+ outputs = model.generate(**inputs, max_new_tokens=1024)
+
+ decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+ print(decoded_outputs)
+ ```
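+
+ The trailing sentence of the text prompt ("Please think and reason about the input music before you respond.") is the think-mode cue: it asks the model to produce its reasoning before the final caption.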
+
  ### Training / Fine-tuning
 
  ```python