import gradio as gr
import transformers
import torch
from peft import PeftModel
import os

# Hugging Face access token taken from the environment (None when unset).
# NOTE(review): presumably required because the base checkpoint is access
# controlled on the Hub -- confirm for the adapter repository as well.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Adapter repository applied on top of the base checkpoint.
model_id = "JerniganLab/qa-only"
base_model = "meta-llama/Meta-Llama-3-8B-Instruct"

# The dtype must be requested here. `pipeline(model_kwargs=...)` is only
# forwarded when the pipeline loads the model itself from a name; with an
# already-instantiated model it is ignored, so the bfloat16 request would be
# dropped and the weights would stay in from_pretrained's default dtype.
llama_model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,
)

# `tokenizer` is given as a repository name, so the pipeline loads the
# tokenizer itself; `token` covers that download.
pipeline = transformers.pipeline(
    "text-generation",
    model=llama_model,
    tokenizer=base_model,
    token=HF_TOKEN,
    device="cuda",
)

# Swap the pipeline's model for the adapter-wrapped version of the same base
# model object that the pipeline was built with.
pipeline.model = PeftModel.from_pretrained(llama_model, model_id, token=HF_TOKEN)

def chat_function(message, history, system_prompt, max_new_tokens, temperature):
    """Generate one assistant reply for the chat UI.

    Args:
        message: Latest user message.
        history: Earlier turns supplied by the UI. Not read: each reply is
            produced from ``system_prompt`` and ``message`` alone.
        system_prompt: Text placed in the system turn of the prompt.
        max_new_tokens: Upper bound on the number of generated tokens. It is
            coerced to ``int`` before being handed to the pipeline (assumes a
            slider may deliver a float -- TODO confirm).
        temperature: Sampling temperature. Positive values are used exactly
            as given; zero or below selects greedy decoding instead of
            sampling.

    Returns:
        The generated text with the prompt prefix removed.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": message},
    ]
    prompt = pipeline.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Stop on either the tokenizer's EOS token or the "<|eot_id|>" token.
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    generation_kwargs = {
        "max_new_tokens": int(max_new_tokens),
        "eos_token_id": terminators,
    }
    if temperature > 0:
        # Use the requested temperature unmodified so the value shown in the
        # UI is the value that is actually applied.
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling needs a strictly positive temperature; treat 0 as a
        # request for deterministic (greedy) output.
        generation_kwargs["do_sample"] = False

    outputs = pipeline(prompt, **generation_kwargs)
    # The pipeline output is sliced by the prompt length so that only the
    # newly generated continuation is returned.
    return outputs[0]["generated_text"][len(prompt):]

"""
For information on how to customize the ChatInterface, see the Gradio docs: https://www.gradio.app/docs/chatinterface
"""
def _build_demo():
    """Assemble the chat UI around ``chat_function``.

    The additional inputs are listed in the same order as the extra
    parameters of ``chat_function`` (system_prompt, max_new_tokens,
    temperature).
    """
    message_box = gr.Textbox(
        placeholder="Enter message here",
        container=False,
        scale=7,
    )
    conversation = gr.Chatbot(height=400)
    extra_controls = [
        gr.Textbox("You are helpful AI", label="System Prompt"),
        gr.Slider(500, 4000, label="Max New Tokens"),
        gr.Slider(0, 1, label="Temperature"),
    ]
    return gr.ChatInterface(
        chat_function,
        textbox=message_box,
        chatbot=conversation,
        additional_inputs=extra_controls,
    )


demo = _build_demo()


if __name__ == "__main__":
    # Start the web UI only when the file is executed as a script.
    demo.launch()