"""Contains functions used for loading and logging models """
import sys
import os
from transformers import AutoModel, AutoProcessor
import os
import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as transforms_pe
from core.vision_encoder.config import PE_VISION_CONFIG
import torchvision.transforms as transforms
from PIL import Image
import requests
def print_trainable_params(model):
    """Print the count and percentage of trainable parameters in `model`."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percent = (trainable_params / total_params * 100) if total_params > 0 else 0
    print("\n--- Summary ---")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Total parameters: {total_params:,}")
    print(f"Percentage: {percent:.2f}%")

def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
    """
    Load a PE ViT model; return the model, its image transform, and the output
    size (dimension of the embedding of the last token).
    """
    print(f'Loading {version}...')
    backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
    backbone_config = PE_VISION_CONFIG[version]
    transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)
    print("\nYou can ignore the Missing keys list above.")
    print(f"Applying migration = {apply_migration_flag}")
    if print_info:
        attnpool = backbone.attn_pool
        print(f'embed_dim={attnpool.embed_dim}\nnum_heads={attnpool.num_heads}')
        print(f'OUTPUT DIM = {backbone_config.output_dim}')

    def apply_migration(m):
        if isinstance(m, pe.SelfAttention):
            m.migrate_weights()

    if apply_migration_flag:  # no migration when testing or resuming from a checkpoint
        print('[MIGRATION] Migrating weights for PEFT compatibility')
        backbone.apply(apply_migration)
    return backbone, transform, backbone_config.output_dim

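# Illustrative sketch of the migration flag's intended use: migrate once when
# first attaching PEFT adapters, never when resuming, so already-migrated
# checkpoint weights are not migrated twice. The config name is an assumption.
#
#     backbone, transform, out_dim = get_backbone_pe(
#         'PE-Core-B16-224',            # illustrative PE config name
#         apply_migration_flag=True,    # fresh PEFT setup only
#     )
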
def get_backbone_dinov3(model_name: str = "facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
    """
    Load a DINOv3 ViT model; return the model, its image transform, and the size
    of the output embedding (hidden size of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # DINOv3 checkpoints are image-only, so the loaded processor is the image
    # processor itself (no .image_processor attribute to unwrap)
    image_processor_config = processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the model; DINOv3 is vision-only, so the full model is the backbone
    vision_model = AutoModel.from_pretrained(model_name)
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n\n{vision_model}')
    return vision_model, transform, vision_model.config.hidden_size

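# Illustrative sketch of running one image through the returned pieces. The
# pooled-output attribute follows the usual transformers convention; treat it
# as an assumption for any given checkpoint. The image path is hypothetical.
#
#     import torch
#     model, transform, dim = get_backbone_dinov3()
#     img = Image.open('example.jpg')          # hypothetical path
#     batch = transform(img).unsqueeze(0)      # (1, 3, H, W)
#     with torch.no_grad():
#         out = model(pixel_values=batch)
#     emb = out.pooler_output                  # shape (1, dim)
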
def get_backbone_siglip2(model_name: str = 'google/siglip2-base-patch16-224', print_info=False):
    """
    Load a SigLIP 2 ViT model; return the vision tower, its image transform, and
    the size of the output embedding (hidden size of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # SigLIP 2 ships a combined processor, so unwrap its image processor
    image_processor_config = processor.image_processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the full two-tower model and keep only the vision backbone
    model = AutoModel.from_pretrained(model_name)
    vision_model = model.vision_model
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\nMULTI-HEAD ATTENTION POOLING HEAD:\n{vision_model.head}')
    return vision_model, transform, vision_model.config.hidden_size

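# Illustrative sketch: the torchvision transform works on one PIL image at a
# time, so a batch is built by stacking. `imgs` is a hypothetical list of
# PIL images.
#
#     import torch
#     vision_model, transform, dim = get_backbone_siglip2()
#     batch = torch.stack([transform(im) for im in imgs])   # (N, 3, 224, 224)
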
def _convert_to_rgb(image: Image.Image) -> Image.Image:
    """Converts a PIL Image to RGB format."""
    return image.convert("RGB")

def get_backbone(version: str, apply_migration: bool = False):
    """
    Returns a vision transformer backbone with its transform and output dimension.
    Args:
        version: Name of the backbone to use (PE-Core, SigLIP 2, or DINOv3)
        apply_migration: Whether to migrate PE weights for PEFT compatibility (PE only)
    """
    if 'PE-Core-' in version:
        return get_backbone_pe(version, False, apply_migration)
    elif 'siglip2' in version:
        print('[LOADING SIGLIP2]')
        return get_backbone_siglip2(version)
    elif 'dinov3' in version:
        return get_backbone_dinov3(version)
    else:
        raise ValueError(f'Unknown backbone version: {version}')

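# Illustrative end-to-end usage (the model name is an assumption, not a pinned
# dependency): the same three-tuple comes back regardless of backbone family.
#
#     backbone, transform, out_dim = get_backbone('google/siglip2-base-patch16-224')
#     print_trainable_params(backbone)
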
def send_telegram_message(message: str):
    """Sends a message to a Telegram chat using the bot token from the environment."""
    # The token comes from the BOT_TOKEN environment variable; the chat id is
    # hard-coded for this project's notification channel.
    token = os.getenv("BOT_TOKEN")
    chat_id = "1220514183"
    if not token or not chat_id:
        # Silently fail if credentials are not set
        return
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'  # for formatting (bold, italics, etc.)
    }
    try:
        response = requests.post(api_url, data=payload, timeout=10)
        response.raise_for_status()  # raise on bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Don't crash the training loop if Telegram is down
        print(f"\nWarning: Could not send Telegram message. Error: {e}")