import base64
import io
import os

import gradio as gr
import torch
from PIL import Image
from huggingface_hub import login
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoProcessor

# Step 1: Log in to Hugging Face (expects HF_TOKEN to be set in the environment)
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

# Step 2: Load the private model and processor
model_name = "anushettypsl/paligemma_vqav2"  # Replace with the actual model link
processor = AutoProcessor.from_pretrained(model_name)
# NOTE: if this checkpoint is a PaliGemma-style vision-language model,
# AutoModelForCausalLM may not support it; PaliGemmaForConditionalGeneration
# is the class typically used for such checkpoints.
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 3: Set up the PEFT configuration (if needed)
lora_config = LoraConfig(
    r=16,             # Rank of the low-rank update matrices
    lora_alpha=32,    # Scaling factor
    lora_dropout=0.1, # Dropout applied to the LoRA layers
    task_type=TaskType.CAUSAL_LM,  # Adjust according to your model's task
)

# Step 4: Wrap the base model with the PEFT adapters
peft_model = get_peft_model(base_model, lora_config)

# Step 5: Define the prediction function
def predict(image_base64, prompt):
    # Decode the base64 string into a PIL image
    image_data = base64.b64decode(image_base64)
    image = Image.open(io.BytesIO(image_data)).convert("RGB")

    # Tokenize the prompt and preprocess the image
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    # Generate output tokens without tracking gradients;
    # max_new_tokens is an arbitrary cap on response length
    with torch.no_grad():
        output = peft_model.generate(**inputs, max_new_tokens=128)

    # Decode the generated token IDs back to text
    generated_text = processor.decode(output[0], skip_special_tokens=True)
    return generated_text

# Step 6: Create the Gradio interface
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(
            label="Image (Base64)",
            placeholder="Enter base64 encoded image here...",
            lines=10,
        ),  # Base64 input for the image
        gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),  # Prompt input
    ],
    outputs="text",  # Text output
    title="Image and Prompt to Text Model",
    description="Enter a base64 encoded image and a prompt to generate a descriptive text.",
)

# Step 7: Launch the Gradio app
interface.launch()
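
# A minimal sketch of how to produce the base64 string the UI above expects.
# The file name "example.jpg" is a placeholder assumption, not part of the app:
#
#   import base64
#   with open("example.jpg", "rb") as f:
#       encoded = base64.b64encode(f.read()).decode("utf-8")
#   print(encoded)  # paste this into the "Image (Base64)" textbox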