from transformers import AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer.
model_name = "Rafay17/Llama3.2_1b_customModle2"  # Use your specific model name

# BUG FIX: FastLanguageModel.from_pretrained returns a (model, tokenizer)
# tuple. The original assigned the whole tuple to `model`, so the later
# for_inference() and model.generate() calls would fail at runtime.
# Unpack both, and prefer the tokenizer Unsloth returns (it is already
# configured for this model) over a separate AutoTokenizer load.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,    # Adjust as needed
    dtype=torch.float16,   # BUG FIX: pass a torch dtype, not the string "float16"
    load_in_4bit=True,     # Adjust based on your needs
)
FastLanguageModel.for_inference(model)  # Call this immediately after loading the model

# BUG FIX: do not hard-code "cuda" — fall back to CPU when no GPU is present,
# instead of crashing with a CUDA error on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"


def generate_response(input_text):
    """Stream a labeled model response for ``input_text`` to stdout.

    Builds a prompt instructing the model to answer with a fixed set of
    analysis labels (speaker, sentiment, intent, ...), tokenizes it, and
    streams the generated continuation token-by-token via ``TextStreamer``.

    Args:
        input_text: Free-form conversation details supplied by the user.

    Returns:
        None. Output is written to stdout by the streamer as it is generated.
    """
    # Prepare the labeled prompt for the model.
    labeled_prompt = (
        "Please provide the response with the following labels:\n"
        "Speaker: [SPEAKER]\n"
        "Text: [TEXT]\n"
        "Sentiment: [SENTIMENT]\n"
        "Emotion: [EMOTION]\n"
        "Intent: [INTENT]\n"
        "Tone: [TONE]\n"
        "Confidence Level: [CONFIDENCE]\n"
        "Frustration Level: [FRUSTRATION]\n"
        "Response Length: [LENGTH]\n"
        "Action Required: [ACTION]\n"
        "Interruption: [INTERRUPTION]\n"
        "Cooperation Level: [COOPERATION]\n"
        "Clarity: [CLARITY]\n"
        "Objective: [OBJECTIVE]\n"
        "Timeline: [TIMELINE]\n"
        "Motivation: [MOTIVATION]\n"
        "Conversation Stage: [STAGE]\n"
        "Resolution: [RESOLUTION]\n"
        "Context: [CONTEXT]\n"
        "Urgency: [URGENCY]\n"
        "Problem Type: [PROBLEM]\n"
        "Key Words: [KEYWORDS]\n"
        "Expected Detail: [DETAIL]\n"
        "Time Gap: [TIME]\n"
        "Client Expectation: [EXPECTATION]\n"
        "Channel: [CHANNEL]\n"
        "Power Relationship: [POWER]\n\n"
        f"User Input: {input_text}\n"
        "Response:"
    )

    # Tokenize; truncate to the model's configured context window.
    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,  # Ensure this matches your model's max length
    ).to(device)

    # Stream tokens to stdout as they are generated, skipping the prompt echo.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Disable gradient calculation for inference.
    with torch.no_grad():
        model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            streamer=text_streamer,
            max_new_tokens=100,  # Adjust this value as needed
            pad_token_id=tokenizer.eos_token_id,
        )


def user_interaction():
    """Run a simple REPL: read conversation details, stream a response.

    Loops until the user types 'exit' (case-insensitive).
    """
    while True:
        user_input = input("Enter conversation details (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            print("Exiting the program.")
            break
        print("Generating response for input:")
        generate_response(user_input)


# BUG FIX: guard the entry point so importing this module does not
# immediately start the interactive loop.
if __name__ == "__main__":
    user_interaction()