Spaces:

detect-tech
/

RishiTest

Sleeping

App Files Files Community

RishiTest / models /InternVL3 /intervl3.py

Lemorra

🎨 Syncing with keyword obtained from IVA

4d4aec6 8 days ago

raw

history blame contribute delete

3.55 kB

	import re
	from typing import Optional

	import PIL
	# "import PIL" alone does not load the Image submodule used in annotations.
	import PIL.Image
	import torch
	from pydantic import BaseModel, Field
	from transformers import AutoModel, AutoTokenizer

	from internvl_utils import load_image
	from payload_model import PayloadModel
	class InternVL3(BaseModel):
	    """Yes/No image question answering on top of an InternVL3 checkpoint.

	    Constructing an instance loads the model and tokenizer named by
	    ``model_name`` through ``transformers``. Awaiting the instance with a
	    list of images, a prompt keyword and an evaluation mode ("or" / "and")
	    returns a single boolean verdict.
	    """

	    # Checkpoint identifier handed to ``from_pretrained``.
	    model_name: str
	    # Both are populated in ``__init__``; ``None`` only before loading.
	    model: Optional[AutoModel] = None
	    tokenizer: Optional[AutoTokenizer] = None
	    # Forwarded unchanged to ``model.chat`` on every prediction.
	    generation_config: dict = Field(
	        default_factory=lambda: {"max_new_tokens": 1024, "do_sample": True}
	    )

	    model_config = {
	        "arbitrary_types_allowed": True,
	        "from_attributes": True,
	    }

	    def __init__(self, model_name: str, **kwargs):
	        """Validate the fields, then load the model and tokenizer.

	        Args:
	            model_name: Checkpoint name or path passed to ``from_pretrained``.
	            **kwargs: Extra field values forwarded to the pydantic constructor.
	        """
	        super().__init__(model_name=model_name, **kwargs)
	        # NOTE(review): use_flash_attn=True is passed even on the CPU
	        # fallback -- confirm the remote model code degrades gracefully.
	        self.model = AutoModel.from_pretrained(
	            self.model_name,
	            torch_dtype=torch.bfloat16,
	            load_in_8bit=False,
	            low_cpu_mem_usage=True,
	            use_flash_attn=True,
	            trust_remote_code=True,
	            device_map="cuda" if torch.cuda.is_available() else "cpu",
	        ).eval()
	        self.tokenizer = AutoTokenizer.from_pretrained(
	            self.model_name,
	            trust_remote_code=True,
	            use_fast=False,
	        )

	    def get_query_prompt(self, prompt_keyword: str) -> Optional[str]:
	        """Return the prompt text for ``prompt_keyword``, or ``None`` if unknown.

	        The keyword is compared case-insensitively. Only "person_running"
	        is currently recognised.
	        """
	        if prompt_keyword.lower() == "person_running":
	            # The literal below is runtime text sent to the model; its line
	            # layout is kept exactly as it appears in the original source.
	            return """
	<image>\nCheck if person is running or not? If they are running
	respond with "Yes" else respond with "No". Limit your response to either "Yes" or "No"
	"""
	        return None

	    def predict(self, pil_image: PIL.Image.Image, prompt_keyword: str) -> str:
	        """Ask the model the question selected by ``prompt_keyword`` about one image.

	        Args:
	            pil_image: Image to query; preprocessed with ``load_image``.
	            prompt_keyword: Keyword resolved through ``get_query_prompt``.

	        Returns:
	            The text returned by ``model.chat``, or the diagnostic string
	            ``"Invalid prompt keyword: <keyword>"`` when the keyword is unknown.
	        """
	        query_prompt = self.get_query_prompt(prompt_keyword)
	        if query_prompt is None:
	            # Bail out before preprocessing: an unknown keyword never
	            # reaches the model, so the image work would be wasted.
	            return f"Invalid prompt keyword: {prompt_keyword}"

	        pixel_values = load_image(pil_image)
	        return self.model.chat(
	            self.tokenizer,
	            pixel_values,
	            query_prompt,
	            generation_config=self.generation_config,
	        )

	    def _evaluate(
	        self,
	        images: list[PIL.Image.Image],
	        prompt_keyword: str,
	        *,
	        stop_on: bool,
	    ) -> tuple[bool, list[str]]:
	        """Run ``predict`` image by image, stopping at the first ``stop_on`` answer.

	        Returns ``(stop_on, responses)`` as soon as an image's answer equals
	        ``stop_on``; otherwise ``(not stop_on, responses)`` after all images.
	        ``responses`` holds only the responses produced before returning.
	        """
	        # For an unknown keyword ``predict`` returns a diagnostic string, not
	        # a model answer. It must never be parsed for "yes", otherwise a
	        # keyword that itself contains "yes" would count as a detection.
	        keyword_is_valid = self.get_query_prompt(prompt_keyword) is not None

	        model_responses: list[str] = []
	        for image in images:
	            model_response = self.predict(image, prompt_keyword)
	            model_responses.append(model_response)
	            is_positive = keyword_is_valid and self.extract_model_response(
	                model_response
	            )
	            if is_positive == stop_on:
	                return stop_on, model_responses
	        return not stop_on, model_responses

	    def eval_or(self, images: list[PIL.Image.Image], prompt_keyword: str):
	        """Return ``(True, responses)`` at the first positive image, else ``(False, responses)``.

	        An empty ``images`` list yields ``(False, [])``.
	        """
	        return self._evaluate(images, prompt_keyword, stop_on=True)

	    def eval_and(self, images: list[PIL.Image.Image], prompt_keyword: str):
	        """Return ``(False, responses)`` at the first negative image, else ``(True, responses)``.

	        An empty ``images`` list yields ``(True, [])``.
	        """
	        return self._evaluate(images, prompt_keyword, stop_on=False)

	    def extract_model_response(self, model_response: str) -> bool:
	        """Return True when ``model_response`` contains the word "yes".

	        The match is case-insensitive and bounded to a whole word, so
	        "Yes", "yes." and "YES" all count while "eyes" does not.
	        """
	        return re.search(r"\byes\b", model_response, flags=re.IGNORECASE) is not None

	    async def __call__(
	        self,
	        images: list[PIL.Image.Image],
	        prompt_keyword: str,
	        prompt_eval_mode: str,
	    ) -> bool:
	        """Evaluate ``images`` against ``prompt_keyword`` and combine the answers.

	        Args:
	            images: Images to evaluate, in order.
	            prompt_keyword: Keyword resolved through ``get_query_prompt``.
	            prompt_eval_mode: "or" (any image positive) or "and" (all positive).

	        Returns:
	            The combined boolean verdict.

	        Raises:
	            ValueError: If ``prompt_eval_mode`` is neither "or" nor "and".
	        """
	        # NOTE: nothing is awaited here; the model calls run synchronously
	        # inside this coroutine.
	        if prompt_eval_mode == "or":
	            overall_response, model_responses = self.eval_or(images, prompt_keyword)
	        elif prompt_eval_mode == "and":
	            overall_response, model_responses = self.eval_and(images, prompt_keyword)
	        else:
	            raise ValueError(f"Invalid prompt eval mode: {prompt_eval_mode}")

	        print(f"Model responses: {model_responses}")

	        return overall_response