```python
# Text-Conditional Diffusion Configuration
IMAGE_SIZE = 64
BATCH_SIZE = 128  # 32 per GPU across 4 GPUs
NUM_EPOCHS = 100  # Longer training for better text conditioning
LEARNING_RATE = 1e-4
TIMESTEPS = 1000

# Model architecture
CHANNELS = 256  # Maximum capacity for best feature learning
TIME_DIM = 128
TEXT_DIM = 512  # CLIP embedding dimension

# Text encoder
CLIP_MODEL = "openai/clip-vit-base-patch32"
FREEZE_CLIP = True  # Freeze CLIP weights during training

# Classifier-free guidance
CFG_DROP_PROB = 0.15  # Increased to 15% for stronger conditioning contrast
CFG_GUIDANCE_SCALE = 5.0  # Higher default guidance scale

# Data
DATASET_NAME = "Xenova/quickdraw-small"
MAX_SAMPLES = None  # Set to None to use all samples, or specify a number
NUM_CLASSES_FILTER = 5  # Small set for a clear proof-of-concept
NUM_CLASSES = None  # Will be set after loading the dataset
```
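
As a minimal sketch of how `CLIP_MODEL`, `FREEZE_CLIP`, and `TEXT_DIM` fit together, the text encoder could be loaded and frozen with Hugging Face `transformers` as below. The `encode_text` helper is illustrative and not part of the original code; for ViT-B/32 the pooled text embedding has dimension 512, matching `TEXT_DIM`.

```python
# Sketch: load and freeze the CLIP text encoder (assumes `transformers` is installed).
import torch
from transformers import CLIPTokenizer, CLIPTextModel

CLIP_MODEL = "openai/clip-vit-base-patch32"
FREEZE_CLIP = True

tokenizer = CLIPTokenizer.from_pretrained(CLIP_MODEL)
text_encoder = CLIPTextModel.from_pretrained(CLIP_MODEL)

if FREEZE_CLIP:
    text_encoder.eval()
    for p in text_encoder.parameters():
        p.requires_grad = False  # CLIP stays fixed; only the UNet is trained

@torch.no_grad()
def encode_text(prompts):
    # Returns pooled text embeddings of shape (batch, 512) for ViT-B/32.
    tokens = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt")
    return text_encoder(**tokens).pooler_output
```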
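
The two classifier-free guidance settings are typically used at different stages: `CFG_DROP_PROB` controls how often the text condition is replaced by a null embedding during training, and `CFG_GUIDANCE_SCALE` blends the conditional and unconditional noise predictions at sampling time. The sketch below assumes a denoiser called as `model(x_t, t, text_emb)`; that signature and the helper names are placeholders, not the original API.

```python
import torch

CFG_DROP_PROB = 0.15
CFG_GUIDANCE_SCALE = 5.0

def drop_text_condition(text_emb, null_emb, drop_prob=CFG_DROP_PROB):
    # Training: with probability drop_prob, swap a sample's text embedding for the
    # null embedding so the model also learns the unconditional noise prediction.
    mask = torch.rand(text_emb.shape[0], device=text_emb.device) < drop_prob
    return torch.where(mask[:, None], null_emb.expand_as(text_emb), text_emb)

def guided_noise_prediction(model, x_t, t, text_emb, null_emb, scale=CFG_GUIDANCE_SCALE):
    # Sampling: eps = eps_uncond + scale * (eps_cond - eps_uncond).
    eps_cond = model(x_t, t, text_emb)
    eps_uncond = model(x_t, t, null_emb.expand_as(text_emb))
    return eps_uncond + scale * (eps_cond - eps_uncond)
```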
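
Finally, a sketch of how the data settings might be applied with the `datasets` library. The `"train"` split, the `"label"` column name, and the filter-by-lowest-label-ids strategy are assumptions about `Xenova/quickdraw-small`, not confirmed details.

```python
from datasets import load_dataset

DATASET_NAME = "Xenova/quickdraw-small"
MAX_SAMPLES = None
NUM_CLASSES_FILTER = 5

# Assumption: the dataset has a "train" split and an integer ClassLabel column named "label".
dataset = load_dataset(DATASET_NAME, split="train")

if NUM_CLASSES_FILTER is not None:
    # Keep only the first NUM_CLASSES_FILTER classes for the proof-of-concept.
    dataset = dataset.filter(lambda ex: ex["label"] < NUM_CLASSES_FILTER)

if MAX_SAMPLES is not None:
    dataset = dataset.select(range(min(MAX_SAMPLES, len(dataset))))

# NUM_CLASSES is resolved after loading, as the config notes.
NUM_CLASSES = (
    NUM_CLASSES_FILTER
    if NUM_CLASSES_FILTER is not None
    else dataset.features["label"].num_classes
)
```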