from transformers import AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer.
model_name = "Rafay17/Llama3.2_1b_customModle2"  # Use your specific model name

# BUG FIX: FastLanguageModel.from_pretrained returns a (model, tokenizer)
# tuple. The original assigned the whole tuple to `model`, so the later
# for_inference() and model.generate() calls would fail at runtime.
# Unpack both, and prefer the tokenizer Unsloth returns (it is already
# configured for this model) over a separate AutoTokenizer load.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,    # Adjust as needed
    dtype=torch.float16,   # BUG FIX: pass a torch dtype, not the string "float16"
    load_in_4bit=True,     # Adjust based on your needs
)
FastLanguageModel.for_inference(model)  # Call this immediately after loading the model

# BUG FIX: do not hard-code "cuda" — fall back to CPU when no GPU is present,
# instead of crashing with a CUDA error on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"


def generate_response(input_text):
    """Stream a labeled model response for ``input_text`` to stdout.

    Builds a prompt instructing the model to answer with a fixed set of
    analysis labels (speaker, sentiment, intent, ...), tokenizes it, and
    streams the generated continuation token-by-token via ``TextStreamer``.

    Args:
        input_text: Free-form conversation details supplied by the user.

    Returns:
        None. Output is written to stdout by the streamer as it is generated.
    """
    # Prepare the labeled prompt for the model.
    labeled_prompt = (
        "Please provide the response with the following labels:\n"
        "Speaker: [SPEAKER]\n"
        "Text: [TEXT]\n"
        "Sentiment: [SENTIMENT]\n"
        "Emotion: [EMOTION]\n"
        "Intent: [INTENT]\n"
        "Tone: [TONE]\n"
        "Confidence Level: [CONFIDENCE]\n"
        "Frustration Level: [FRUSTRATION]\n"
        "Response Length: [LENGTH]\n"
        "Action Required: [ACTION]\n"
        "Interruption: [INTERRUPTION]\n"
        "Cooperation Level: [COOPERATION]\n"
        "Clarity: [CLARITY]\n"
        "Objective: [OBJECTIVE]\n"
        "Timeline: [TIMELINE]\n"
        "Motivation: [MOTIVATION]\n"
        "Conversation Stage: [STAGE]\n"
        "Resolution: [RESOLUTION]\n"
        "Context: [CONTEXT]\n"
        "Urgency: [URGENCY]\n"
        "Problem Type: [PROBLEM]\n"
        "Key Words: [KEYWORDS]\n"
        "Expected Detail: [DETAIL]\n"
        "Time Gap: [TIME]\n"
        "Client Expectation: [EXPECTATION]\n"
        "Channel: [CHANNEL]\n"
        "Power Relationship: [POWER]\n\n"
        f"User Input: {input_text}\n"
        "Response:"
    )

    # Tokenize; truncate to the model's configured context window.
    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,  # Ensure this matches your model's max length
    ).to(device)

    # Stream tokens to stdout as they are generated, skipping the prompt echo.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Disable gradient calculation for inference.
    with torch.no_grad():
        model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            streamer=text_streamer,
            max_new_tokens=100,  # Adjust this value as needed
            pad_token_id=tokenizer.eos_token_id,
        )


def user_interaction():
    """Run a simple REPL: read conversation details, stream a response.

    Loops until the user types 'exit' (case-insensitive).
    """
    while True:
        user_input = input("Enter conversation details (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            print("Exiting the program.")
            break
        print("Generating response for input:")
        generate_response(user_input)


# BUG FIX: guard the entry point so importing this module does not
# immediately start the interactive loop.
if __name__ == "__main__":
    user_interaction()