# FaR-FT-PE/utils/commons.py
"""Contains functions used for loading and logging models """
import sys
import os
from transformers import AutoModel, AutoProcessor
import os
import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as transforms_pe
from core.vision_encoder.config import PE_VISION_CONFIG
import torchvision.transforms as transforms
from PIL import Image
import requests


def print_trainable_params(model):
    """Print the count and percentage of trainable parameters in a model."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percent = (trainable_params / total_params * 100) if total_params > 0 else 0
    print("\n--- Summary ---")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Total parameters: {total_params:,}")
    print(f"Percentage: {percent:.2f}%")


def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
    """
    Load a PE ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f'Loading {version}...')
    backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
    backbone_config = PE_VISION_CONFIG[version]
    transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)
    print("\nYou can ignore the Missing keys list above.")
    print(f"Applying migration = {apply_migration_flag}")

    if print_info:
        attn_pool = backbone.attn_pool
        print(f'embed_dim={attn_pool.embed_dim}\nnum_heads={attn_pool.num_heads}')
        print(f'OUTPUT DIM = {backbone_config.output_dim}')

    def apply_migration(m):
        if isinstance(m, pe.SelfAttention):
            m.migrate_weights()

    if apply_migration_flag:  # when testing/resuming, no migration should be used
        print('[MIGRATION] Migrating weights for PEFT compatibility')
        backbone.apply(apply_migration)

    return backbone, transform, backbone_config.output_dim
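
# Example usage of get_backbone_pe (a sketch; "PE-Core-B16-224" is assumed to be one
# of the config names registered in PE_VISION_CONFIG):
#
#   backbone, transform, out_dim = get_backbone_pe("PE-Core-B16-224", print_info=True)
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)  # (1, 3, H, W)
#   emb = backbone(x)                                    # last dimension == out_dim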


def get_backbone_dinov3(model_name: str = "facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
    """
    Load a DINOv3 ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # DINOv3 is vision-only, so AutoProcessor returns the image processor itself
    image_processor_config = processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the model; DINOv3 has no separate text tower, so this is the backbone itself
    vision_model = AutoModel.from_pretrained(model_name)
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n\n{vision_model}')
    return vision_model, transform, vision_model.config.hidden_size
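
# Example usage of get_backbone_dinov3 (a sketch; the default checkpoint may be gated
# on the Hugging Face Hub, in which case an authenticated login is required):
#
#   vision_model, transform, hidden = get_backbone_dinov3()
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)
#   out = vision_model(pixel_values=x)
#   cls_embedding = out.last_hidden_state[:, 0]   # (1, hidden) CLS token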


def get_backbone_siglip2(model_name: str = 'google/siglip2-base-patch16-224', print_info=False):
    """
    Load a SigLIP2 ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # Extract the image preprocessing configuration from the loaded processor
    image_processor_config = processor.image_processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the full model and keep only the vision tower
    model = AutoModel.from_pretrained(model_name)
    vision_model = model.vision_model
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\nMAP head (multi-head attention pooling):\n{vision_model.head}')
    return vision_model, transform, vision_model.config.hidden_size
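
# Example usage of get_backbone_siglip2 (a sketch with the default public checkpoint):
#
#   vision_model, transform, hidden = get_backbone_siglip2()
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)
#   out = vision_model(pixel_values=x)
#   pooled = out.pooler_output                    # (1, hidden) from the MAP head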


def _convert_to_rgb(image: Image.Image) -> Image.Image:
    """Converts a PIL Image to RGB format."""
    return image.convert("RGB")


def get_backbone(version: str, apply_migration: bool = False):
    """
    Returns a vision transformer backbone.
    Args:
        version: Name of the backbone to use (PE-Core, SigLIP2, or DINOv3)
        apply_migration: if True, migrates PE attention weights for PEFT compatibility (PE only)
    """
    if 'PE-Core-' in version:
        return get_backbone_pe(version, False, apply_migration)
    elif 'siglip2' in version:
        print('[LOADING SIGLIP2]')
        return get_backbone_siglip2(version)
    elif 'dinov3' in version:
        return get_backbone_dinov3(version)
    raise ValueError(f'Unknown backbone version: {version}')
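
# Example usage of get_backbone (a sketch; dispatch is by substring of `version`):
#
#   backbone, transform, out_dim = get_backbone('google/siglip2-base-patch16-224')
#   backbone, transform, out_dim = get_backbone('PE-Core-B16-224', apply_migration=True)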


def send_telegram_message(message: str):
    """Sends a message to a Telegram chat; the bot token is read from the environment."""
    # Read the bot token from the BOT_TOKEN environment variable; the chat id is hardcoded
    token = os.getenv("BOT_TOKEN")
    chat_id = "1220514183"
    if not token or not chat_id:
        # Silently fail if credentials are not set
        return
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'  # For nice formatting (bold, italics, etc.)
    }
    try:
        response = requests.post(api_url, data=payload, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Don't crash the training loop if Telegram is down
        print(f"\nWarning: Could not send Telegram message. Error: {e}")