import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor
import gradio as gr

# Hub repository used for both the model weights and the image preprocessor.
_MODEL_ID = 'OpenGVLab/InternViT-6B-448px-V1-5'

# Instantiate the model in bfloat16 with Flash Attention disabled, then move
# it to the GPU and put it in evaluation mode.
model = AutoModel.from_pretrained(
    _MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    use_flash_attn=False,
)
model = model.cuda()
model = model.eval()

# Image preprocessor loaded from the same repository as the model.
image_processor = CLIPImageProcessor.from_pretrained(_MODEL_ID)

# Define the function to process the image and generate outputs
def process_image(image):
    """Run the model on one uploaded image and describe its output.

    Args:
        image: The PIL image delivered by the Gradio image input, or ``None``
            when nothing was uploaded.

    Returns:
        str: ``"Output Shape: ..."`` carrying the shape of the model's
        ``last_hidden_state`` on success, otherwise a message that starts
        with ``"Error: "``. Exceptions are not propagated to the caller.
    """
    # The image input can be empty (e.g. the form is submitted without an
    # upload); say so directly rather than leaking an AttributeError message.
    if image is None:
        return "Error: No image provided."

    try:
        # Convert the uploaded image to RGB before preprocessing.
        rgb_image = image.convert('RGB')

        # Preprocess, then match the dtype/device the model is run with.
        pixel_values = image_processor(images=rgb_image, return_tensors='pt').pixel_values
        pixel_values = pixel_values.to(torch.bfloat16).cuda()

        # Inference only: disable autograd so no graph is recorded for a
        # backward pass that never happens.
        with torch.no_grad():
            outputs = model(pixel_values)

        # Assuming the model returns embeddings or features
        return f"Output Shape: {outputs.last_hidden_state.shape}"
    except Exception as e:
        # UI boundary: turn any failure into text the output box can show.
        return f"Error: {e}"

# Build the Gradio UI: a PIL image goes in, a single text box comes out.
_image_input = gr.Image(type="pil")
_text_output = gr.Textbox(label="Model Output")

demo = gr.Interface(
    fn=process_image,
    inputs=_image_input,
    outputs=_text_output,
    title="InternViT Demo",
    description="Upload an image to process it using the InternViT model from OpenGVLab.",
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    # Bind to all network interfaces on port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)