# Source: Hugging Face Space (status when captured: Sleeping)
import base64
import io
import os

import cv2
import requests
import torch
from huggingface_hub import login
from peft import PeftModel, PeftConfig
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
# Hub repositories: the PEFT adapter and the base PaliGemma checkpoint it is
# applied to. Kept as constants so the model and processor cannot drift apart.
ADAPTER_ID = "anushettypsl/paligemma_vqav2"
BASE_MODEL_ID = "google/paligemma-3b-pt-448"

# Authenticate with the Hugging Face Hub; a missing HF_TOKEN raises KeyError.
access_token = os.environ["HF_TOKEN"]
login(token=access_token)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): dtype is not passed to any loader below, so the weights load in
# the library's default precision -- confirm whether bfloat16 was intended.
dtype = torch.bfloat16

# Fetches the adapter's configuration; the result is not used further here.
config = PeftConfig.from_pretrained(ADAPTER_ID)

# Load the base model first, then wrap it with the adapter weights.
base_model = PaliGemmaForConditionalGeneration.from_pretrained(BASE_MODEL_ID)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, device_map=device)

# The processor only prepares inputs (it holds no model weights), so no device
# placement is requested for it; its outputs are moved to `device` by the caller.
processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)

# Move the combined base + adapter model onto the selected device.
model.to(device)
# Image to query and the text prompt sent alongside it.
IMAGE_PATH = '/content/15_BC_G2_6358_40x_2_jpg.rf.97595fa4965f66ad45be8fd055331933.jpg'
# NOTE(review): the script never defined `input_text`, so the processor call
# failed with NameError. This generic question is a placeholder -- replace it
# with the prompt the adapter is meant to answer.
input_text = "What is shown in this image?"

# cv2.imread reports an unreadable or missing file by returning None instead of
# raising, so fail here with a clear message rather than deeper in the pipeline.
image = cv2.imread(IMAGE_PATH)
if image is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")

# OpenCV decodes to BGR channel order; reorder to RGB and wrap the pixels in a
# PIL image directly, avoiding a lossy JPEG re-encode/decode round trip.
input_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

model_inputs = processor(
    text=input_text, images=input_image, return_tensors="pt"
).to(device)

# Number of prompt tokens, used below to strip the prompt from the output.
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    # Greedy decoding (do_sample=False), capped at 100 newly generated tokens.
    generation = model.generate(
        **model_inputs, max_new_tokens=100, do_sample=False
    )
    # Keep only the tokens produced after the prompt for the first sequence.
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)