import base64
import io

import torch
from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor


class InstructBlipHandler:
    """Inference handler for InstructBLIP image+text-to-text generation.

    Wraps a pretrained ``InstructBlipForConditionalGeneration`` model and its
    ``InstructBlipProcessor`` behind a single callable that accepts a dict of
    ``{"image": <base64-encoded image bytes>, "text": <instruction prompt>}``
    and returns the generated text.
    """

    def __init__(self, model, tokenizer):
        """Store the model and its processor.

        Args:
            model: An ``InstructBlipForConditionalGeneration`` instance.
            tokenizer: An ``InstructBlipProcessor`` instance (kept under the
                historical parameter name ``tokenizer`` for backward
                compatibility with existing callers).
        """
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, input_data):
        """Run the full preprocess -> generate -> postprocess pipeline.

        Args:
            input_data: Dict with keys ``"image"`` (base64-encoded image file
                bytes, e.g. a JPEG or PNG) and ``"text"`` (the instruction).

        Returns:
            list[str]: Decoded generations, one per batch element.
        """
        inputs = self.preprocess(input_data)
        # Inference only — no_grad avoids building autograd state.
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        return self.postprocess(outputs)

    def preprocess(self, input_data):
        """Decode the base64 image and build model-ready tensors.

        The previous implementation passed raw base64-decoded bytes straight
        to ``torch.tensor`` as ``pixel_values``, which both fails at runtime
        (``torch.tensor`` cannot consume ``bytes``) and would not be a valid
        preprocessed image tensor anyway. It also omitted the
        ``qformer_input_ids`` that InstructBLIP's ``generate`` requires.
        Running the decoded PIL image and the prompt through the processor
        produces every required key (``pixel_values``, ``input_ids``,
        ``attention_mask``, ``qformer_input_ids``, ``qformer_attention_mask``)
        in one call.

        Args:
            input_data: Dict with ``"image"`` (base64 string) and ``"text"``.

        Returns:
            A ``BatchFeature`` of tensors suitable for ``model.generate``.
        """
        # Pillow is a required dependency of transformers' vision pipelines,
        # so it is available wherever this handler can run; imported locally
        # to avoid adding a new top-level dependency to the module.
        from PIL import Image

        image_bytes = base64.b64decode(input_data["image"])
        # convert("RGB") normalizes palette/greyscale/alpha inputs to the
        # 3-channel format the image processor expects.
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return self.tokenizer(
            images=image,
            text=input_data["text"],
            return_tensors="pt",
        )

    def postprocess(self, outputs):
        """Decode generated token ids to text, dropping special tokens.

        Args:
            outputs: Token-id tensor returned by ``model.generate``.

        Returns:
            list[str]: One decoded string per sequence.
        """
        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)


# NOTE(review): module import triggers a large checkpoint download; consider
# moving this behind a factory or ``if __name__ == "__main__":`` guard.
model = InstructBlipForConditionalGeneration.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl"
)
tokenizer = InstructBlipProcessor.from_pretrained(
    "Salesforce/instructblip-flan-t5-xl"
)
handler = InstructBlipHandler(model, tokenizer)