File size: 868 Bytes
35bd2c2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# Text-Conditional Diffusion Configuration
# Module-level hyperparameters for a CLIP-conditioned diffusion model.
# All names here are read by other modules — treat them as the public
# interface of this file; change values, not names.

# --- Training ---
IMAGE_SIZE: int = 64  # square resolution in pixels — presumably per side; confirm in data pipeline
BATCH_SIZE: int = 128  # 32 per GPU across 4 GPUs (global batch size)
NUM_EPOCHS: int = 100  # Longer training for better text conditioning
LEARNING_RATE: float = 1e-4  # NOTE(review): optimizer not visible here — confirm scheduler/warmup elsewhere
TIMESTEPS: int = 1000  # number of diffusion steps — presumably the DDPM noise-schedule length; verify in sampler

# --- Model architecture ---
CHANNELS: int = 256  # Maximum capacity for best feature learning (base channel width — TODO confirm usage)
TIME_DIM: int = 128  # timestep-embedding dimension — assumed; verify against the model definition
TEXT_DIM: int = 512  # CLIP embedding dimension (matches clip-vit-base-patch32's 512-d text output)

# --- Text encoder ---
CLIP_MODEL: str = "openai/clip-vit-base-patch32"  # Hugging Face model id for the text encoder
FREEZE_CLIP: bool = True  # Freeze CLIP weights during training

# --- Classifier-free guidance ---
# CFG_DROP_PROB and CFG_GUIDANCE_SCALE interact: a higher drop probability
# trains a stronger unconditional branch, which supports higher guidance
# scales at sampling time.
CFG_DROP_PROB: float = 0.15  # Increased to 15% for stronger conditioning contrast
CFG_GUIDANCE_SCALE: float = 5.0  # Higher default guidance scale

# --- Data ---
DATASET_NAME: str = "Xenova/quickdraw-small"  # Hugging Face dataset id — assumed; confirm loader
MAX_SAMPLES = None  # Set to None to use all samples, or specify a number
NUM_CLASSES_FILTER: int = 5  # Small set for clear proof-of-concept
NUM_CLASSES = None  # Will be set after loading dataset