"""Contains functions used for loading and logging models """
import sys
import os
from transformers import AutoModel, AutoProcessor
import os
import core.vision_encoder.pe as pe
import core.vision_encoder.transforms as transforms_pe
from core.vision_encoder.config import PE_VISION_CONFIG
import torchvision.transforms as transforms
from PIL import Image
import requests
def print_trainable_params(model):
    """Print the count and percentage of trainable parameters in `model`."""
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    percent = (trainable_params / total_params * 100) if total_params > 0 else 0
    print("\n--- Summary ---")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Total parameters: {total_params:,}")
    print(f"Percentage: {percent:.2f}%")

def get_backbone_pe(version, print_info=False, apply_migration_flag=False, pretrained=True):
    """
    Load a PE ViT model; return the model, its image transform, and the output
    size (dimension of the embedding of the last token).
    """
    print(f'Loading {version}...')
    backbone = pe.VisionTransformer.from_config(version, pretrained=pretrained)
    backbone_config = PE_VISION_CONFIG[version]
    transform = transforms_pe.get_image_transform_fix(image_size=backbone_config.image_size)
    print("\nYou can ignore the Missing keys list above.")
    print(f"Applying migration = {apply_migration_flag}")
    if print_info:
        attnpool = backbone.attn_pool
        print(f'embed_dim={attnpool.embed_dim}\nnum_heads={attnpool.num_heads}')
        print(f'OUTPUT DIM = {backbone_config.output_dim}')

    def apply_migration(m):
        if isinstance(m, pe.SelfAttention):
            m.migrate_weights()

    if apply_migration_flag:  # no migration when testing or resuming from a checkpoint
        print('[MIGRATION] Migrating weights for PEFT compatibility')
        backbone.apply(apply_migration)
    return backbone, transform, backbone_config.output_dim

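# Illustrative sketch of the migration flag's intended use: migrate once when
# first attaching PEFT adapters, never when resuming, so already-migrated
# checkpoint weights are not migrated twice. The config name is an assumption.
#
#     backbone, transform, out_dim = get_backbone_pe(
#         'PE-Core-B16-224',            # illustrative PE config name
#         apply_migration_flag=True,    # fresh PEFT setup only
#     )
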
def get_backbone_dinov3(model_name: str = "facebook/dinov3-vitb16-pretrain-lvd1689m", print_info=False):
    """
    Load a DINOv3 ViT model; return the model, its image transform, and the size
    of the output embedding (hidden size of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # DINOv3 checkpoints are image-only, so the loaded processor is the image
    # processor itself (no .image_processor attribute to unwrap)
    image_processor_config = processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the model; DINOv3 is vision-only, so the full model is the backbone
    vision_model = AutoModel.from_pretrained(model_name)
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\n\n\n{vision_model}')
    return vision_model, transform, vision_model.config.hidden_size

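# Illustrative sketch of running one image through the returned pieces. The
# pooled-output attribute follows the usual transformers convention; treat it
# as an assumption for any given checkpoint. The image path is hypothetical.
#
#     import torch
#     model, transform, dim = get_backbone_dinov3()
#     img = Image.open('example.jpg')          # hypothetical path
#     batch = transform(img).unsqueeze(0)      # (1, 3, H, W)
#     with torch.no_grad():
#         out = model(pixel_values=batch)
#     emb = out.pooler_output                  # shape (1, dim)
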
def get_backbone_siglip2(model_name: str = 'google/siglip2-base-patch16-224', print_info=False):
    """
    Load a SigLIP 2 ViT model; return the vision tower, its image transform, and
    the size of the output embedding (hidden size of the last token).
    """
    print(f"Loading Hugging Face model: {model_name}")
    processor = AutoProcessor.from_pretrained(model_name)
    # SigLIP 2 ships a combined processor, so unwrap its image processor
    image_processor_config = processor.image_processor
    image_size = image_processor_config.size['height']
    image_mean = image_processor_config.image_mean
    image_std = image_processor_config.image_std
    transform = transforms.Compose([
        transforms.Lambda(_convert_to_rgb),
        transforms.Resize((image_size, image_size), antialias=True),
        transforms.ToTensor(),
        transforms.Normalize(mean=image_mean, std=image_std)
    ])
    # Load the full two-tower model and keep only the vision backbone
    model = AutoModel.from_pretrained(model_name)
    vision_model = model.vision_model
    if print_info:
        print(f'\nVISION CONFIGS:\n{vision_model.config}')
        print(f'\nMULTI-HEAD ATTENTION POOLING HEAD:\n{vision_model.head}')
    return vision_model, transform, vision_model.config.hidden_size

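# Illustrative sketch: the torchvision transform works on one PIL image at a
# time, so a batch is built by stacking. `imgs` is a hypothetical list of
# PIL images.
#
#     import torch
#     vision_model, transform, dim = get_backbone_siglip2()
#     batch = torch.stack([transform(im) for im in imgs])   # (N, 3, 224, 224)
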
def _convert_to_rgb(image: Image.Image) -> Image.Image:
    """Converts a PIL Image to RGB format."""
    return image.convert("RGB")

def get_backbone(version: str, apply_migration: bool = False):
    """
    Returns a vision transformer backbone with its transform and output dimension.
    Args:
        version: Name of the backbone to use (PE-Core, SigLIP 2, or DINOv3)
        apply_migration: Whether to migrate PE weights for PEFT compatibility (PE only)
    """
    if 'PE-Core-' in version:
        return get_backbone_pe(version, False, apply_migration)
    elif 'siglip2' in version:
        print('[LOADING SIGLIP2]')
        return get_backbone_siglip2(version)
    elif 'dinov3' in version:
        return get_backbone_dinov3(version)
    else:
        raise ValueError(f'Unknown backbone version: {version}')

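# Illustrative end-to-end usage (the model name is an assumption, not a pinned
# dependency): the same three-tuple comes back regardless of backbone family.
#
#     backbone, transform, out_dim = get_backbone('google/siglip2-base-patch16-224')
#     print_trainable_params(backbone)
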
def send_telegram_message(message: str):
    """Sends a message to a Telegram chat using the bot token from the environment."""
    # The token comes from the BOT_TOKEN environment variable; the chat id is
    # hard-coded for this project's notification channel.
    token = os.getenv("BOT_TOKEN")
    chat_id = "1220514183"
    if not token or not chat_id:
        # Silently fail if credentials are not set
        return
    api_url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {
        'chat_id': chat_id,
        'text': message,
        'parse_mode': 'Markdown'  # for formatting (bold, italics, etc.)
    }
    try:
        response = requests.post(api_url, data=payload, timeout=10)
        response.raise_for_status()  # raise on bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        # Don't crash the training loop if Telegram is down
        print(f"\nWarning: Could not send Telegram message. Error: {e}")