ritikraj2425 committed
Commit 381836a
1 Parent(s): 21cb85d

Initial commit: diffusion demo

Files changed (5):
  1. app.py +98 -0
  2. diffusion_model.pth +3 -0
  3. requirements.txt +3 -0
  4. subword_tokenizer.json +0 -0
  5. train.py +131 -0
app.py ADDED
@@ -0,0 +1,98 @@
+ import gradio as gr
+ import torch
+ import torch.nn.functional as F
+ from tokenizers import Tokenizer
+ from train import MaskedDiffusionModel
+
+ # --- Setup & Loading ---
+ def load_model_and_tokenizer():
+     device = torch.device("cpu")
+     tokenizer = Tokenizer.from_file("subword_tokenizer.json")
+     vocab = tokenizer.get_vocab()
+     id2word = {int(v): k for k, v in vocab.items()}
+
+     # Initialize with the last known training settings
+     model = MaskedDiffusionModel(vocab_size=len(vocab), d_model=768, nhead=12, num_layers=12, max_seq_len=128).to(device)
+     model.load_state_dict(torch.load("diffusion_model.pth", map_location=device))
+     model.eval()
+     return model, tokenizer, id2word, device
+
+ model, tokenizer, id2word, device = load_model_and_tokenizer()
+
+ def decode_tensor(tensor, id2word):
+     words = [id2word.get(token_id, "[UNK]") for token_id in tensor.tolist()]
+     return " ".join(words).replace(" ##", "").replace("##", "")
+
+ # --- Inference Generator ---
+ def generate_step_by_step(prompt):
+     max_len = 15
+     steps = 15
+     temp = 0.1
+
+     bos_id = tokenizer.token_to_id("[BOS]")
+     eos_id = tokenizer.token_to_id("[EOS]")
+     mask_id = tokenizer.token_to_id("[MASK]")
+
+     formatted_prompt = f"user: {prompt} bot:"
+     input_ids = tokenizer.encode(formatted_prompt).ids
+
+     sequence = [bos_id] + input_ids + [mask_id] * max_len + [eos_id]
+     seq_tensor = torch.tensor([sequence], dtype=torch.long, device=device)
+
+     mask_indices = (seq_tensor == mask_id).squeeze(0).nonzero(as_tuple=True)[0]
+
+     output_log = f"**Prompt:** {prompt}\n\n**Iterative Denoising Process:**\n"
+     output_log += f"`Step 00: {decode_tensor(seq_tensor[0], id2word)}`\n"
+     yield output_log
+
+     for step in range(1, steps + 1):
+         with torch.no_grad():
+             logits = model(seq_tensor)
+
+         probs = F.softmax(logits / max(temp, 1e-6), dim=-1)
+         predicted_ids = torch.multinomial(probs.view(-1, probs.size(-1)), 1).view(probs.shape[:-1]).squeeze(0)
+
+         true_probs = F.softmax(logits, dim=-1).squeeze(0)
+         confidences = torch.gather(true_probs, 1, predicted_ids.unsqueeze(1)).squeeze(1)
+
+         target_unmasked = int(len(mask_indices) * (step / steps))
+         current_seq = seq_tensor.squeeze(0).clone()
+
+         for idx in mask_indices:
+             current_seq[idx] = predicted_ids[idx]
+
+         if step < steps:
+             gen_conf = confidences[mask_indices]
+             num_remask = len(mask_indices) - target_unmasked
+             if num_remask > 0:
+                 _, low_conf_idx = torch.topk(gen_conf, k=num_remask, largest=False)
+                 for idx in mask_indices[low_conf_idx]:
+                     current_seq[idx] = mask_id
+
+         seq_tensor = current_seq.unsqueeze(0)
+         output_log += f"`Step {step:02d}: {decode_tensor(seq_tensor[0], id2word)}`\n"
+         yield output_log
+
+     response_ids = seq_tensor[0][1 + len(input_ids) : -1]
+     final_text = decode_tensor(response_ids, id2word)
+     output_log += f"\n**Final Output:** {final_text}"
+     yield output_log
+
+ # --- Gradio UI ---
+ description_text = """
+ ### Experimental Discrete Diffusion Language Model (Proof of Concept)
+ **Note to Reviewers:** This is an early-stage checkpoint of a 110M-parameter Transformer trained from scratch with a masked discrete diffusion objective. Because compute access ended before convergence (loss ~2.0), the final English generations are fragmented.
+
+ However, the demo successfully showcases the **iterative denoising algorithm**: watch the model transition from pure `[MASK]` tokens to predicted vocabulary over 15 discrete diffusion steps.
+ """
+
+ demo = gr.Interface(
+     fn=generate_step_by_step,
+     inputs=gr.Textbox(label="Enter a short prompt (e.g., 'hi')"),
+     outputs=gr.Markdown(label="Diffusion Denoising Steps"),
+     title="Discrete Diffusion NLP Demo",
+     description=description_text
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
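
`generate_step_by_step` is a generator, so the Gradio `Markdown` output streams one denoising step per `yield`. As a minimal sketch (not part of the commit, assuming `app.py`, `diffusion_model.pth`, and `subword_tokenizer.json` sit in the working directory), the same generator can also be driven from a plain Python script:

```python
# Hypothetical driver script, not part of the commit: streams the denoising
# log produced by app.generate_step_by_step without the Gradio UI.
# Importing app also loads the checkpoint and tokenizer at module level.
from app import generate_step_by_step

for partial_log in generate_step_by_step("hi"):
    # Each yield is the full Markdown log so far; print only the newest line,
    # i.e. the current "Step NN: ..." view of the partially denoised sequence.
    print(partial_log.strip().splitlines()[-1])
```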
diffusion_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f71ef132ac216aea6033adb80749c8d23ced2d55fd1180234b306a2eb89c1957
+ size 364368167
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch
+ tokenizers
+ gradio
subword_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
train.py ADDED
@@ -0,0 +1,131 @@
+ import json
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.optim import AdamW
+ from dataset import create_dataloader  # Ensure your dataset.py is in the same folder
+ import math
+
+ # --- Architecture Components ---
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=5000):
+         super().__init__()
+         pe = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+         self.register_buffer('pe', pe.unsqueeze(0))
+
+     def forward(self, x):
+         return x + self.pe[:, :x.size(1), :]
+
+ class MaskedDiffusionModel(nn.Module):
+     def __init__(self, vocab_size, d_model=768, nhead=12, num_layers=12, max_seq_len=128):
+         super().__init__()
+         self.embedding = nn.Embedding(vocab_size, d_model)
+         self.pos_encoder = PositionalEncoding(d_model, max_len=max_seq_len)
+
+         encoder_layer = nn.TransformerEncoderLayer(
+             d_model=d_model,
+             nhead=nhead,
+             dim_feedforward=d_model * 4,
+             batch_first=True,
+             activation="gelu"
+         )
+         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
+         self.fc_out = nn.Linear(d_model, vocab_size)
+
+     def forward(self, x):
+         x = self.embedding(x)
+         x = self.pos_encoder(x)
+         out = self.transformer(x)
+         return self.fc_out(out)
+
+ # --- Training Logic ---
+
+ def apply_forward_masking(x_0, mask_id, special_ids):
+     batch_size, seq_len = x_0.shape
+     device = x_0.device
+     t = torch.rand(1).item()
+     t = max(t, 0.1)  # Minimum 10% masking for better learning
+
+     rand_probs = torch.rand((batch_size, seq_len), device=device)
+     is_special = torch.isin(x_0, torch.tensor(special_ids, device=device))
+     is_mask = (rand_probs < t) & (~is_special)
+
+     x_t = x_0.clone()
+     x_t[is_mask] = mask_id
+     return x_t, is_mask, t
+
+ def count_parameters(model):
+     return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+ def train_model():
+     # 1. SETUP DEVICE (NVIDIA CUDA)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"🚀 Training on: {device}")
+
+     # 2. LOAD VOCAB
+     with open("subword_tokenizer.json", "r", encoding="utf-8") as f:
+         vocab_data = json.load(f)
+     vocab = vocab_data["model"]["vocab"]
+
+     vocab_size = len(vocab)
+     mask_id = vocab["[MASK]"]
+     special_ids = [vocab["[PAD]"], vocab["[BOS]"], vocab["[EOS]"], vocab["[UNK]"]]
+
+     # 3. INITIALIZE MODEL (MAX POWER VALUES)
+     model = MaskedDiffusionModel(
+         vocab_size=vocab_size,
+         d_model=768,
+         nhead=12,
+         num_layers=12,
+         max_seq_len=128
+     ).to(device)
+
+     # PRINT PARAMETER COUNT
+     print(f" Model Capacity: {count_parameters(model):,} parameters")
+
+     # 4. OPTIMIZER & DATALOADER
+     # Larger batch size for NVIDIA GPUs
+     dataloader, _ = create_dataloader("tokenized_data.json", batch_size=64)
+     optimizer = AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)
+
+     epochs = 200
+     print(f"Starting training for {epochs} epochs...")
+
+     for epoch in range(epochs):
+         model.train()
+         total_raw_ce = 0
+
+         for x_0 in dataloader:
+             x_0 = x_0.to(device)
+             optimizer.zero_grad()
+
+             x_t, is_mask, t = apply_forward_masking(x_0, mask_id, special_ids)
+             logits = model(x_t)
+
+             # Loss Calculation
+             loss_per_token = F.cross_entropy(logits.view(-1, vocab_size), x_0.view(-1), reduction='none').view_as(x_0)
+             masked_loss = (loss_per_token * is_mask.float()).sum() / (is_mask.sum() + 1e-8)
+
+             # Diffusion scaling
+             scaled_loss = masked_loss * min(1.0 / (t + 1e-5), 5.0)
+
+             scaled_loss.backward()
+             torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+             optimizer.step()
+
+             total_raw_ce += masked_loss.item()
+
+         avg_error = total_raw_ce / len(dataloader)
+         if (epoch + 1) % 5 == 0:
+             print(f"Epoch {epoch+1}/{epochs} | True Error (CE): {avg_error:.4f}")
+
+     torch.save(model.state_dict(), "diffusion_model.pth")
+     print("✅ Training complete! Weights saved.")
+
+ if __name__ == "__main__":
+     train_model()
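
For reference, the forward (noising) step that `apply_forward_masking` performs can be illustrated on a toy batch. This is a minimal sketch, not part of the commit; the token ids below are hypothetical placeholders for the ids in `subword_tokenizer.json`. In the actual training loop the resulting masked-token cross-entropy is additionally reweighted by `min(1/t, 5.0)` before backpropagation, so heavily masked batches are not overweighted relative to lightly masked ones.

```python
# Toy illustration (not part of the commit) of the forward masking used in
# train.py: sample a masking ratio t, then replace roughly a fraction t of the
# non-special tokens with [MASK]. Ids 0-3 and 4 are hypothetical stand-ins for
# the special tokens and [MASK]; the real ids come from subword_tokenizer.json.
import torch

special_ids = [0, 1, 2, 3]                     # [PAD], [BOS], [EOS], [UNK] (placeholders)
mask_id = 4                                    # [MASK] (placeholder)
x_0 = torch.tensor([[1, 10, 11, 12, 13, 2]])   # [BOS] w1 w2 w3 w4 [EOS]

t = max(torch.rand(1).item(), 0.1)             # masking ratio, floored at 10%
rand_probs = torch.rand(x_0.shape)
is_special = torch.isin(x_0, torch.tensor(special_ids))
is_mask = (rand_probs < t) & (~is_special)     # special tokens are never masked

x_t = x_0.clone()
x_t[is_mask] = mask_id
print(round(t, 2), x_t.tolist())               # e.g. 0.62 [[1, 4, 11, 4, 4, 2]]
```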