import re

from PIL import Image
from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor


class VQA:
    def __init__(self, gpu_number=0):
        # gpu_number is kept for API compatibility; device_map="auto" already
        # places the weights on the available GPU(s) automatically.
        self.model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-vicuna-7b", device_map="auto"
        )
        self.processor = InstructBlipProcessor.from_pretrained(
            "Salesforce/instructblip-vicuna-7b"
        )
        self.model.eval()

        self.qa_prompt = "Question: {} Short answer:"
        self.caption_prompt = "\n\na photo of"
        self.max_words = 50

    def pre_question(self, question):
        # Question normalization from the LAVIS BLIP processors:
        # lowercase, strip punctuation, and truncate to max_words.
        question = re.sub(r"([.!\"()*#:;~])", "", question.lower())
        question = question.rstrip(" ")

        question_words = question.split(" ")
        if len(question_words) > self.max_words:
            question = " ".join(question_words[: self.max_words])

        return question

    def qa(self, image_path, question):
        image = Image.open(image_path).convert("RGB")
        question = self.pre_question(question)
        prompt = self.qa_prompt.format(question)
        inputs = self.processor(
            images=image, text=prompt, return_tensors="pt", padding="longest"
        ).to(self.model.device)
        # Deterministic beam search; the negative length penalty biases
        # generation toward short answers. Sampling parameters (top_p,
        # temperature) are omitted because do_sample=False ignores them.
        generated_ids = self.model.generate(
            **inputs,
            do_sample=False,
            num_beams=5,
            max_length=30,
            min_length=1,
            length_penalty=-1,
            repetition_penalty=1.0,
            num_return_sequences=1,
        )
        generated_text = self.processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )
        return generated_text[0].strip()
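
# Minimal usage sketch. The image path and question below are illustrative
# placeholders, not part of the original module; substitute a real image file.
if __name__ == "__main__":
    vqa = VQA()
    answer = vqa.qa("example.jpg", "What color is the car?")
    print(answer)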