# FaR-FT-PE/utils/commons.py
"""Contains functions used for loading and logging models """
import sys
import os
from transformers import AutoModel, AutoProcessor
import os
import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as transforms_pe
from core.vision_encoder.config import PE_VISION_CONFIG
import torchvision.transforms as transforms
from PIL import Image
import requests


def print_trainable_params(model):
    """Print the count and percentage of trainable parameters in a model."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percent = (trainable_params / total_params * 100) if total_params > 0 else 0
    print("\n--- Summary ---")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Total parameters: {total_params:,}")
    print(f"Percentage: {percent:.2f}%")


def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
    """
    Load a PE ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f'Loading {version}...')
    backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
    backbone_config = PE_VISION_CONFIG[version]
    transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)
    print("\nYou can ignore the Missing keys list above.")
    print(f"Applying migration = {apply_migration_flag}")

    if print_info:
        attn_pool = backbone.attn_pool
        print(f'embed_dim={attn_pool.embed_dim}\nnum_heads={attn_pool.num_heads}')
        print(f'OUTPUT DIM = {backbone_config.output_dim}')

    def apply_migration(m):
        if isinstance(m, pe.SelfAttention):
            m.migrate_weights()

    if apply_migration_flag:  # when testing/resuming, no migration should be used
        print('[MIGRATION] Migrating weights for PEFT compatibility')
        backbone.apply(apply_migration)

    return backbone, transform, backbone_config.output_dim
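
# Example usage of get_backbone_pe (a sketch; "PE-Core-B16-224" is assumed to be one
# of the config names registered in PE_VISION_CONFIG):
#
#   backbone, transform, out_dim = get_backbone_pe("PE-Core-B16-224", print_info=True)
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)  # (1, 3, H, W)
#   emb = backbone(x)                                    # last dimension == out_dim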


def get_backbone_dinov3(model_name: str = "facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
    """
    Load a DINOv3 ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # DINOv3 is vision-only, so AutoProcessor returns the image processor itself
    image_processor_config = processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the model; DINOv3 has no separate text tower, so this is the backbone itself
    vision_model = AutoModel.from_pretrained(model_name)
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n\n{vision_model}')
    return vision_model, transform, vision_model.config.hidden_size
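
# Example usage of get_backbone_dinov3 (a sketch; the default checkpoint may be gated
# on the Hugging Face Hub, in which case an authenticated login is required):
#
#   vision_model, transform, hidden = get_backbone_dinov3()
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)
#   out = vision_model(pixel_values=x)
#   cls_embedding = out.last_hidden_state[:, 0]   # (1, hidden) CLS token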


def get_backbone_siglip2(model_name: str = 'google/siglip2-base-patch16-224', print_info=False):
    """
    Load a SigLIP2 ViT model; return the model, its transforms, and the output size
    (dimension of the embedding of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # Extract the image preprocessing configuration from the loaded processor
    image_processor_config = processor.image_processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the full model and keep only the vision tower
    model = AutoModel.from_pretrained(model_name)
    vision_model = model.vision_model
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\nMAP head (multi-head attention pooling):\n{vision_model.head}')
    return vision_model, transform, vision_model.config.hidden_size
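
# Example usage of get_backbone_siglip2 (a sketch with the default public checkpoint):
#
#   vision_model, transform, hidden = get_backbone_siglip2()
#   x = transform(Image.open("photo.jpg")).unsqueeze(0)
#   out = vision_model(pixel_values=x)
#   pooled = out.pooler_output                    # (1, hidden) from the MAP head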


def _convert_to_rgb(image: Image.Image) -> Image.Image:
    """Converts a PIL Image to RGB format."""
    return image.convert("RGB")


def get_backbone(version: str, apply_migration: bool = False):
    """
    Returns a vision transformer backbone.
    Args:
        version: Name of the backbone to use (PE-Core, SigLIP2, or DINOv3)
        apply_migration: if True, migrates PE attention weights for PEFT compatibility (PE only)
    """
    if 'PE-Core-' in version:
        return get_backbone_pe(version, False, apply_migration)
    elif 'siglip2' in version:
        print('[LOADING SIGLIP2]')
        return get_backbone_siglip2(version)
    elif 'dinov3' in version:
        return get_backbone_dinov3(version)
    raise ValueError(f'Unknown backbone version: {version}')
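
# Example usage of get_backbone (a sketch; dispatch is by substring of `version`):
#
#   backbone, transform, out_dim = get_backbone('google/siglip2-base-patch16-224')
#   backbone, transform, out_dim = get_backbone('PE-Core-B16-224', apply_migration=True)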


def send_telegram_message(message: str):
    """Sends a message to a Telegram chat; the bot token is read from the environment."""
    # Read the bot token from the BOT_TOKEN environment variable; the chat id is hardcoded
    token = os.getenv("BOT_TOKEN")
    chat_id = "1220514183"
    if not token or not chat_id:
        # Silently fail if credentials are not set
        return
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'  # For nice formatting (bold, italics, etc.)
    }
    try:
        response = requests.post(api_url, data=payload, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Don't crash the training loop if Telegram is down
        print(f"\nWarning: Could not send Telegram message. Error: {e}")