import os
import re

import torch
from detectron2.structures import BoxMode

from objectrelator.constants import (IMAGE_TOKEN_INDEX, SEG_TOKEN_INDEX, CLS_TOKEN_INDEX,
                                     REGION_TOKEN_INDEX, REFER_TOKEN_INDEX)
from objectrelator.train.train_datasets import (COCO_interactive_dataset,
                                                COCO_interactive_dataset_eval,
                                                COCO_interactive_dataset_train)
from objectrelator.eval.segmentation_evaluation import openseg_classes

COLOR_MAP = openseg_classes.ADE20K_150_CATEGORIES
|
|
| ''' |
| Create custom dataset classes for EgoExo and Handal datasets |
| ''' |
|
|
|
|
class EgoExo_Dataset_eval(COCO_interactive_dataset_eval):

    def preprocess_referring_instruction(self, instruction, REFER_token='[SEG]'):
        # Tokenize the free-form instruction, then append the id of REFER_token
        # ('[SEG]') so the referring sequence ends with the segmentation trigger.
        tokenized = self.tokenizer.encode(instruction, add_special_tokens=False)
        tokenized = tokenized + [self.tokenizer.encode(REFER_token, add_special_tokens=False)[0]]
        token_refer_id = torch.tensor(tokenized)
        return token_refer_id
|
|
    def tokenizer_special_tokens(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX,
                                 seg_token_index=SEG_TOKEN_INDEX, cls_token_index=CLS_TOKEN_INDEX,
                                 region_token_index=REGION_TOKEN_INDEX, refer_token_index=REFER_TOKEN_INDEX,
                                 return_tensors=None):
        # Split the prompt around the multimodal placeholders and map each
        # placeholder to its reserved token index; ordinary text chunks are
        # tokenized with the regular tokenizer.
        special_token_map = {'<image>': image_token_index, '<seg>': seg_token_index,
                             '<cls>': cls_token_index, '<region>': region_token_index,
                             '<refer>': refer_token_index}
        prompt_chunks = re.split('(<image>|<seg>|<cls>|<region>|<refer>)', prompt)
        input_ids = []
        for chunk in prompt_chunks:
            if chunk in special_token_map:
                input_ids.append(special_token_map[chunk])
            else:
                input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
        if return_tensors is None:
            return input_ids
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long).squeeze()
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
|
    def __getitem__(self, idx):
        data = self.data[idx]
        image_file = data['image']
        image_folder = self.data_args.image_folder

        # Build a Detectron2-style record for the query image and the
        # visual-prompt (first) frame; boxes are unused here, so they are zeroed.
        data_dict = {}
        data_dict['file_name'] = os.path.join(image_folder, image_file)
        data_dict['height'] = data['image_info']['height']
        data_dict['width'] = data['image_info']['width']
        data_dict['image_id'] = data['new_img_id']
        data_dict['annotations'] = data['anns']
        data_dict['vp_annotations'] = data['first_frame_anns']
        data_dict['vp_image'] = os.path.join(image_folder, data['first_frame_image'])
        for annotation in data_dict['annotations'] + data_dict['vp_annotations']:
            annotation['bbox_mode'] = BoxMode.XYXY_ABS
            annotation['bbox'] = [0, 0, 0, 0]
            annotation['image_id'] = data['new_img_id']

        if isinstance(self.data_args.image_processor, dict):
            processor = self.data_args.image_processor['instance']
        else:
            processor = self.data_args.image_processor
        region_mask_type = getattr(self.data_args, 'region_mask_type', None)
        if region_mask_type is not None:
            region_mask_type = region_mask_type.split('||')
        data_dict = processor.preprocess(data_dict, region_mask_type=region_mask_type, mask_format='bitmask')

        # Compose the conversation. The prompt strings are kept verbatim so that
        # tokenization matches the template the model was trained with.
        sentences = data['instruction']
        num_target = len(data_dict['instances'])  # assumes at least one target instance
        prefix_inst = 'This is an image <image>, Please segment by given regions and instruction'
        instruction = ''
        for sent in sentences:
            instruction += ' {}.'.format(sent['sent'])
        regions_inst = ' <region>,' * (num_target - 1) + ' <region>.'
        sources_value = f'\nThis is all regions: {regions_inst}\n'
        sources = [[{'from': 'human', 'value': prefix_inst + sources_value + 'and this is the instruction: ' + '<refer>\n'},
                    {'from': 'gpt', 'value': '\n[SEG]<seg>'}]]
        text_dict = self.preprocess_llama2(sources, self.tokenizer)
        input_ids = text_dict['input_ids'][0]
        labels = text_dict['labels'][0]

        # Tokenize the referring instruction and mark where the <refer>
        # placeholder sits in input_ids so its embedding can be injected there.
        token_refer_id = self.preprocess_referring_instruction(instruction)
        refer_embedding_indices = torch.zeros_like(input_ids)
        refer_embedding_indices[input_ids == REFER_TOKEN_INDEX] = 1

        data_dict['input_ids'] = input_ids
        data_dict['labels'] = labels
        data_dict['dataset_type'] = 'referring_coco'
        data_dict['token_refer_id'] = token_refer_id
        data_dict['refer_embedding_indices'] = refer_embedding_indices
        return data_dict
| |
|
|
class EgoExo_Dataset_train(COCO_interactive_dataset_train):
    # Training counterpart of EgoExo_Dataset_eval; the prompt construction
    # below is identical, only the base class differs.

    def preprocess_referring_instruction(self, instruction, REFER_token='[SEG]'):
        # Tokenize the free-form instruction, then append the id of REFER_token
        # ('[SEG]') so the referring sequence ends with the segmentation trigger.
        tokenized = self.tokenizer.encode(instruction, add_special_tokens=False)
        tokenized = tokenized + [self.tokenizer.encode(REFER_token, add_special_tokens=False)[0]]
        token_refer_id = torch.tensor(tokenized)
        return token_refer_id
| |
    def tokenizer_special_tokens(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX,
                                 seg_token_index=SEG_TOKEN_INDEX, cls_token_index=CLS_TOKEN_INDEX,
                                 region_token_index=REGION_TOKEN_INDEX, refer_token_index=REFER_TOKEN_INDEX,
                                 return_tensors=None):
        # Split the prompt around the multimodal placeholders and map each
        # placeholder to its reserved token index; ordinary text chunks are
        # tokenized with the regular tokenizer.
        special_token_map = {'<image>': image_token_index, '<seg>': seg_token_index,
                             '<cls>': cls_token_index, '<region>': region_token_index,
                             '<refer>': refer_token_index}
        prompt_chunks = re.split('(<image>|<seg>|<cls>|<region>|<refer>)', prompt)
        input_ids = []
        for chunk in prompt_chunks:
            if chunk in special_token_map:
                input_ids.append(special_token_map[chunk])
            else:
                input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
        if return_tensors is None:
            return input_ids
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long).squeeze()
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
|
    def __getitem__(self, idx):
        data = self.data[idx]
        image_file = data['image']
        image_folder = self.data_args.image_folder

        # Build a Detectron2-style record for the query image and the
        # visual-prompt (first) frame; boxes are unused here, so they are zeroed.
        data_dict = {}
        data_dict['file_name'] = os.path.join(image_folder, image_file)
        data_dict['height'] = data['image_info']['height']
        data_dict['width'] = data['image_info']['width']
        data_dict['image_id'] = data['new_img_id']
        data_dict['annotations'] = data['anns']
        data_dict['vp_annotations'] = data['first_frame_anns']
        data_dict['vp_image'] = os.path.join(image_folder, data['first_frame_image'])
        for annotation in data_dict['annotations'] + data_dict['vp_annotations']:
            annotation['bbox_mode'] = BoxMode.XYXY_ABS
            annotation['bbox'] = [0, 0, 0, 0]
            annotation['image_id'] = data['new_img_id']

        if isinstance(self.data_args.image_processor, dict):
            processor = self.data_args.image_processor['instance']
        else:
            processor = self.data_args.image_processor
        region_mask_type = getattr(self.data_args, 'region_mask_type', None)
        if region_mask_type is not None:
            region_mask_type = region_mask_type.split('||')
        data_dict = processor.preprocess(data_dict, region_mask_type=region_mask_type, mask_format='bitmask')

        # Compose the conversation. The prompt strings are kept verbatim so that
        # tokenization matches the template the model was trained with.
        sentences = data['instruction']
        num_target = len(data_dict['instances'])  # assumes at least one target instance
        prefix_inst = 'This is an image <image>, Please segment by given regions and instruction'
        instruction = ''
        for sent in sentences:
            instruction += ' {}.'.format(sent['sent'])
        regions_inst = ' <region>,' * (num_target - 1) + ' <region>.'
        sources_value = f'\nThis is all regions: {regions_inst}\n'
        sources = [[{'from': 'human', 'value': prefix_inst + sources_value + 'and this is the instruction: ' + '<refer>\n'},
                    {'from': 'gpt', 'value': '\n[SEG]<seg>'}]]
        text_dict = self.preprocess_llama2(sources, self.tokenizer)
        input_ids = text_dict['input_ids'][0]
        labels = text_dict['labels'][0]

        # Tokenize the referring instruction and mark where the <refer>
        # placeholder sits in input_ids so its embedding can be injected there.
        token_refer_id = self.preprocess_referring_instruction(instruction)
        refer_embedding_indices = torch.zeros_like(input_ids)
        refer_embedding_indices[input_ids == REFER_TOKEN_INDEX] = 1

        data_dict['input_ids'] = input_ids
        data_dict['labels'] = labels
        data_dict['dataset_type'] = 'referring_coco'
        data_dict['token_refer_id'] = token_refer_id
        data_dict['refer_embedding_indices'] = refer_embedding_indices
        return data_dict
|
|
class Handal_Dataset_eval(COCO_interactive_dataset):
    # HANDAL evaluation dataset; note that it subclasses the plain
    # COCO_interactive_dataset base rather than the *_eval variant.

    def preprocess_referring_instruction(self, instruction, REFER_token='[SEG]'):
        # Tokenize the free-form instruction, then append the id of REFER_token
        # ('[SEG]') so the referring sequence ends with the segmentation trigger.
        tokenized = self.tokenizer.encode(instruction, add_special_tokens=False)
        tokenized = tokenized + [self.tokenizer.encode(REFER_token, add_special_tokens=False)[0]]
        token_refer_id = torch.tensor(tokenized)
        return token_refer_id
| |
    def tokenizer_special_tokens(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX,
                                 seg_token_index=SEG_TOKEN_INDEX, cls_token_index=CLS_TOKEN_INDEX,
                                 region_token_index=REGION_TOKEN_INDEX, refer_token_index=REFER_TOKEN_INDEX,
                                 return_tensors=None):
        # Split the prompt around the multimodal placeholders and map each
        # placeholder to its reserved token index; ordinary text chunks are
        # tokenized with the regular tokenizer.
        special_token_map = {'<image>': image_token_index, '<seg>': seg_token_index,
                             '<cls>': cls_token_index, '<region>': region_token_index,
                             '<refer>': refer_token_index}
        prompt_chunks = re.split('(<image>|<seg>|<cls>|<region>|<refer>)', prompt)
        input_ids = []
        for chunk in prompt_chunks:
            if chunk in special_token_map:
                input_ids.append(special_token_map[chunk])
            else:
                input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
        if return_tensors is None:
            return input_ids
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long).squeeze()
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
|
    def __getitem__(self, idx):
        data = self.data[idx]
        image_file = data['image']
        image_folder = self.data_args.image_folder

        # Build a Detectron2-style record for the query image and the
        # visual-prompt (first) frame; boxes are unused here, so they are zeroed.
        data_dict = {}
        data_dict['file_name'] = os.path.join(image_folder, image_file)
        data_dict['height'] = data['image_info']['height']
        data_dict['width'] = data['image_info']['width']
        data_dict['image_id'] = data['new_img_id']
        data_dict['annotations'] = data['anns']
        data_dict['vp_annotations'] = data['first_frame_anns']
        data_dict['vp_image'] = os.path.join(image_folder, data['first_frame_image'])
        for annotation in data_dict['annotations'] + data_dict['vp_annotations']:
            annotation['bbox_mode'] = BoxMode.XYXY_ABS
            annotation['bbox'] = [0, 0, 0, 0]
            annotation['image_id'] = data['new_img_id']

        if isinstance(self.data_args.image_processor, dict):
            processor = self.data_args.image_processor['instance']
        else:
            processor = self.data_args.image_processor
        region_mask_type = getattr(self.data_args, 'region_mask_type', None)
        if region_mask_type is not None:
            region_mask_type = region_mask_type.split('||')
        data_dict = processor.preprocess(data_dict, region_mask_type=region_mask_type, mask_format='bitmask')

        # Compose the conversation. The prompt strings are kept verbatim so that
        # tokenization matches the template the model was trained with.
        sentences = data['instruction']
        num_target = len(data_dict['instances'])  # assumes at least one target instance
        prefix_inst = 'This is an image <image>, Please segment by given regions and instruction'
        instruction = ''
        for sent in sentences:
            instruction += ' {}.'.format(sent['sent'])
        regions_inst = ' <region>,' * (num_target - 1) + ' <region>.'
        sources_value = f'\nThis is all regions: {regions_inst}\n'
        sources = [[{'from': 'human', 'value': prefix_inst + sources_value + 'and this is the instruction: ' + '<refer>\n'},
                    {'from': 'gpt', 'value': '\n[SEG]<seg>'}]]
        text_dict = self.preprocess_llama2(sources, self.tokenizer)
        input_ids = text_dict['input_ids'][0]
        labels = text_dict['labels'][0]

        # Tokenize the referring instruction and mark where the <refer>
        # placeholder sits in input_ids so its embedding can be injected there.
        token_refer_id = self.preprocess_referring_instruction(instruction)
        refer_embedding_indices = torch.zeros_like(input_ids)
        refer_embedding_indices[input_ids == REFER_TOKEN_INDEX] = 1

        data_dict['input_ids'] = input_ids
        data_dict['labels'] = labels
        data_dict['dataset_type'] = 'referring_coco'
        data_dict['token_refer_id'] = token_refer_id
        data_dict['refer_embedding_indices'] = refer_embedding_indices
        return data_dict
| |
class Handal_Dataset_train(COCO_interactive_dataset_train):
    # Training counterpart of Handal_Dataset_eval.

    def preprocess_referring_instruction(self, instruction, REFER_token='[SEG]'):
        # Tokenize the free-form instruction, then append the id of REFER_token
        # ('[SEG]') so the referring sequence ends with the segmentation trigger.
        tokenized = self.tokenizer.encode(instruction, add_special_tokens=False)
        tokenized = tokenized + [self.tokenizer.encode(REFER_token, add_special_tokens=False)[0]]
        token_refer_id = torch.tensor(tokenized)
        return token_refer_id
| |
    def tokenizer_special_tokens(self, prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX,
                                 seg_token_index=SEG_TOKEN_INDEX, cls_token_index=CLS_TOKEN_INDEX,
                                 region_token_index=REGION_TOKEN_INDEX, refer_token_index=REFER_TOKEN_INDEX,
                                 return_tensors=None):
        # Split the prompt around the multimodal placeholders and map each
        # placeholder to its reserved token index; ordinary text chunks are
        # tokenized with the regular tokenizer.
        special_token_map = {'<image>': image_token_index, '<seg>': seg_token_index,
                             '<cls>': cls_token_index, '<region>': region_token_index,
                             '<refer>': refer_token_index}
        prompt_chunks = re.split('(<image>|<seg>|<cls>|<region>|<refer>)', prompt)
        input_ids = []
        for chunk in prompt_chunks:
            if chunk in special_token_map:
                input_ids.append(special_token_map[chunk])
            else:
                input_ids.extend(tokenizer.encode(chunk, add_special_tokens=False))
        if return_tensors is None:
            return input_ids
        if return_tensors == 'pt':
            return torch.tensor(input_ids, dtype=torch.long).squeeze()
        raise ValueError(f'Unsupported tensor type: {return_tensors}')
|
|
    def __getitem__(self, idx):
        data = self.data[idx]
        image_file = data['image']
        image_folder = self.data_args.image_folder

        # Build a Detectron2-style record for the query image and the
        # visual-prompt (first) frame; boxes are unused here, so they are zeroed.
        data_dict = {}
        data_dict['file_name'] = os.path.join(image_folder, image_file)
        data_dict['height'] = data['image_info']['height']
        data_dict['width'] = data['image_info']['width']
        data_dict['image_id'] = data['new_img_id']
        data_dict['annotations'] = data['anns']
        data_dict['vp_annotations'] = data['first_frame_anns']
        data_dict['vp_image'] = os.path.join(image_folder, data['first_frame_image'])
        for annotation in data_dict['annotations'] + data_dict['vp_annotations']:
            annotation['bbox_mode'] = BoxMode.XYXY_ABS
            annotation['bbox'] = [0, 0, 0, 0]
            annotation['image_id'] = data['new_img_id']

        if isinstance(self.data_args.image_processor, dict):
            processor = self.data_args.image_processor['instance']
        else:
            processor = self.data_args.image_processor
        region_mask_type = getattr(self.data_args, 'region_mask_type', None)
        if region_mask_type is not None:
            region_mask_type = region_mask_type.split('||')
        data_dict = processor.preprocess(data_dict, region_mask_type=region_mask_type, mask_format='bitmask')

        # Compose the conversation. The prompt strings are kept verbatim so that
        # tokenization matches the template the model was trained with.
        sentences = data['instruction']
        num_target = len(data_dict['instances'])  # assumes at least one target instance
        prefix_inst = 'This is an image <image>, Please segment by given regions and instruction'
        instruction = ''
        for sent in sentences:
            instruction += ' {}.'.format(sent['sent'])
        regions_inst = ' <region>,' * (num_target - 1) + ' <region>.'
        sources_value = f'\nThis is all regions: {regions_inst}\n'
        sources = [[{'from': 'human', 'value': prefix_inst + sources_value + 'and this is the instruction: ' + '<refer>\n'},
                    {'from': 'gpt', 'value': '\n[SEG]<seg>'}]]
        text_dict = self.preprocess_llama2(sources, self.tokenizer)
        input_ids = text_dict['input_ids'][0]
        labels = text_dict['labels'][0]

        # Tokenize the referring instruction and mark where the <refer>
        # placeholder sits in input_ids so its embedding can be injected there.
        token_refer_id = self.preprocess_referring_instruction(instruction)
        refer_embedding_indices = torch.zeros_like(input_ids)
        refer_embedding_indices[input_ids == REFER_TOKEN_INDEX] = 1

        data_dict['input_ids'] = input_ids
        data_dict['labels'] = labels
        data_dict['dataset_type'] = 'referring_coco'
        data_dict['token_refer_id'] = token_refer_id
        data_dict['refer_embedding_indices'] = refer_embedding_indices
        return data_dict
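

# Consumption sketch: batching the Detectron2-style records produced above
# requires the project's own collator; `collate_fn` below is a placeholder,
# not an API defined in this module.
#
#   from torch.utils.data import DataLoader
#   loader = DataLoader(train_set, batch_size=2, collate_fn=collate_fn)
#   batch = next(iter(loader))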