from PIL import Image
import requests
import re

class VQA:
    def __init__(self, gpu_number=0):
        # Lazy import so transformers is only pulled in when the model is built.
        from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor

        # InstructBLIP (Vicuna-7B) for visual question answering.
        # device_map="auto" places the weights on the available GPU(s),
        # so gpu_number is kept only for interface compatibility.
        self.model = InstructBlipForConditionalGeneration.from_pretrained(
            "Salesforce/instructblip-vicuna-7b", device_map="auto"
        )
        self.processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        self.model.eval()

        self.qa_prompt = "Question: {} Short answer:"
        self.caption_prompt = "\n<image>\na photo of"
        self.max_words = 50
    def pre_question(self, question):
        # from LAVIS blip_processors: lowercase and strip punctuation
        question = re.sub(
            r"([.!\"()*#:;~])",
            "",
            question.lower(),
        )
        question = question.rstrip(" ")

        # truncate question to at most max_words words
        question_words = question.split(" ")
        if len(question_words) > self.max_words:
            question = " ".join(question_words[: self.max_words])
        return question
    def qa(self, image_path, question):
        image = Image.open(image_path).convert("RGB")
        # Clean the question and wrap it in the short-answer prompt.
        question = self.qa_prompt.format(self.pre_question(question))
        inputs = self.processor(images=image, text=question, return_tensors="pt",
                                padding="longest").to(self.model.device)
        generated_ids = self.model.generate(**inputs, length_penalty=-1, num_beams=5,
                                            max_length=30, min_length=1, do_sample=False,
                                            top_p=0.9, repetition_penalty=1.0,
                                            num_return_sequences=1, temperature=1)
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        return generated_text[0].strip()
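
# A minimal usage sketch, assuming an image file such as "example.jpg" (hypothetical
# path) is available locally; the first call downloads the InstructBLIP weights from
# the Hugging Face Hub.
if __name__ == "__main__":
    # Hypothetical example inputs; replace with a real image and question.
    vqa = VQA()
    answer = vqa.qa("example.jpg", "What color is the car?")
    print(answer)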