import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
from safetensors.torch import load_file

# Load the Hugging Face API token from the environment
token = os.getenv("HUGGINGFACE_API_TOKEN")
if not token:
    raise ValueError("HUGGINGFACE_API_TOKEN is not set. Please add it in the Secrets section of your Space.")

# Configure device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and base model using the token
model_repo = "Grandediw/lora_model"
tokenizer = AutoTokenizer.from_pretrained(model_repo, token=token)
base_model = AutoModel.from_pretrained(model_repo, token=token)

# Load the LoRA adapter weights from the local safetensors file
lora_weights_path = "adapter_model.safetensors"
lora_weights = load_file(lora_weights_path)

# Apply the LoRA weights to the base model.
# This assumes the safetensors file stores full-size weight deltas keyed by the
# base model's parameter names; entries with non-matching names are skipped.
for name, param in base_model.named_parameters():
    if name in lora_weights:
        # Add the delta on the parameter's current device to avoid a CPU/GPU mismatch
        param.data += lora_weights[name].to(param.device, dtype=param.dtype)

# Move the patched model to the target device
base_model = base_model.to(device)
base_model.eval()

# Define the inference function
def infer(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = base_model(**inputs)
    # Placeholder: return the mean-pooled hidden state as text.
    # Replace this with generation or task-specific post-processing for your model.
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return str(embedding)

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## LoRA Model Inference")
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt here...")
        generate_button = gr.Button("Generate")
    output = gr.Textbox(label="Output")
    generate_button.click(fn=infer, inputs=[prompt], outputs=[output])

if __name__ == "__main__":
    demo.launch()
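
# Note: the manual merge above only works if adapter_model.safetensors stores
# full-size deltas keyed by the base model's parameter names. Standard LoRA
# adapters instead ship low-rank lora_A/lora_B matrices plus an
# adapter_config.json, which are normally loaded with the PEFT library.
# A minimal sketch, assuming `peft` is installed and the repo contains a
# PEFT-style adapter (kept commented out so it does not affect this app):
#
#     from peft import PeftModel
#
#     base_model = AutoModel.from_pretrained(model_repo, token=token)
#     base_model = PeftModel.from_pretrained(base_model, model_repo, token=token)
#     base_model = base_model.to(device)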