from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
import gradio as gr

max_seq_length = 2048   # Choose any! We auto support RoPE Scaling internally!
dtype = None            # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True     # Use 4-bit quantization to reduce memory usage. Can be False.

# 4-bit pre-quantized models we support, for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google model trained on 6 trillion tokens, 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

# Load the fine-tuned model ONCE at startup. The original code reloaded it inside
# the request handler, which re-reads a 7B model from disk on every single query.
# To serve a base model instead, set model_name to any checkpoint, e.g.
# "unsloth/mistral-7b-v0.3" or "teknium/OpenHermes-2.5-Mistral-7B".
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/drive/MyDrive/Colab Notebooks/lora_model",  # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...",  # use one if using gated models like meta-llama/Llama-2-7b-hf
)
FastLanguageModel.for_inference(model)  # Enable Unsloth's faster inference mode

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Standing instruction sent with every query (a Category,Instruction table in CSV form).
system_instruction = """Category,Instruction
General Response,'Answer the user's query thoroughly and accurately, ensuring no details or points are omitted. Always recognize that 'AEC' refers to 'Assam Engineering College,' and vice versa, and use this understanding to provide clear, context-aware responses.'
Formatting,'Structure the output to be attractive, engaging, and professional, using proper formatting. Break the response into multiple paragraphs or sections if necessary to improve readability and organization.'
Use of Lists,'For queries that involve enumerations, options, or multiple steps, use bullet points or numbered lists to present the information clearly and concisely. For example:
- When listing departments or facilities.
- When explaining procedures or step-by-step guides.
- When summarizing key features or FAQs.'
Tone,'Maintain a friendly, informative tone, and deliver complete, standard answers to meet the user's expectations.'"""

# Handle a user query and return the model's response.
def chatbot_response(user_query):
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                system_instruction,  # instruction
                user_query,          # input
                "",                  # output - leave this blank for generation!
            )
        ],
        return_tensors = "pt",
    ).to("cuda")

    text_streamer = TextStreamer(tokenizer)

    # Generate the response (streamed to stdout as it is produced).
    response = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

    # Decode it, skipping special tokens so </s> does not leak into the output.
    decoded_output = tokenizer.batch_decode(response, skip_special_tokens = True)[0]

    # Keep only the text after the "### Response:" marker.
    response_start = decoded_output.find("### Response:") + len("### Response:")
    final_response = decoded_output[response_start:].strip()

    print("User Query:", user_query)          # Just for debugging, can be removed
    print("Model Response:", final_response)  # Just for debugging, can be removed
    return final_response

# Gradio interface
interface = gr.Interface(
    fn = chatbot_response,  # Function that processes the user input
    inputs = gr.Textbox(
        label = "Enter your query:",        # Label for the input box
        placeholder = "Type something...",  # Placeholder text
    ),
    outputs = gr.Textbox(label = "Response:"),  # Output box for the model's response
    title = "Simple Chatbot",
    description = "This is a simple chatbot interface. Type your query and get a response.",
)

# Launch the Gradio app
interface.launch()
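# --- Optional Colab setup (a sketch, not part of the original script) ---
# The adapter path above lives on Google Drive, which suggests this runs in a
# Colab notebook. If so, Drive must be mounted BEFORE the from_pretrained call
# at the top, and launching with share=True yields a temporary public URL for
# the app instead of a localhost-only server:
#
#     from google.colab import drive
#     drive.mount("/content/drive")
#     ...
#     interface.launch(share=True)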