from transformers import TextStreamer
from unsloth import FastLanguageModel
import torch

# Load the model and tokenizer (FastLanguageModel.from_pretrained returns both)
model_name = "Rafay17/Llama3.2_1b_customModle2"  # Use your specific model name
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,    # Adjust as needed
    dtype=torch.float16,   # Adjust as needed
    load_in_4bit=True,     # Adjust based on your needs
)
FastLanguageModel.for_inference(model)  # Call this immediately after loading the model

# Function to generate a response
def generate_response(input_text):
    # Prepare the labeled prompt for the model
    labeled_prompt = (
        "Please provide the response with the following labels:\n"
        "Speaker: [SPEAKER]\n"
        "Text: [TEXT]\n"
        "Sentiment: [SENTIMENT]\n"
        "Emotion: [EMOTION]\n"
        "Intent: [INTENT]\n"
        "Tone: [TONE]\n"
        "Confidence Level: [CONFIDENCE]\n"
        "Frustration Level: [FRUSTRATION]\n"
        "Response Length: [LENGTH]\n"
        "Action Required: [ACTION]\n"
        "Interruption: [INTERRUPTION]\n"
        "Cooperation Level: [COOPERATION]\n"
        "Clarity: [CLARITY]\n"
        "Objective: [OBJECTIVE]\n"
        "Timeline: [TIMELINE]\n"
        "Motivation: [MOTIVATION]\n"
        "Conversation Stage: [STAGE]\n"
        "Resolution: [RESOLUTION]\n"
        "Context: [CONTEXT]\n"
        "Urgency: [URGENCY]\n"
        "Problem Type: [PROBLEM]\n"
        "Key Words: [KEYWORDS]\n"
        "Expected Detail: [DETAIL]\n"
        "Time Gap: [TIME]\n"
        "Client Expectation: [EXPECTATION]\n"
        "Channel: [CHANNEL]\n"
        "Power Relationship: [POWER]\n\n"
        f"User Input: {input_text}\n"
        "Response:"
    )
    # Prepare the input for the model
    inputs = tokenizer(
        [labeled_prompt],
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,  # Ensure this matches your model's max length
    ).to("cuda")

    # Set up the text streamer to stream the generated response
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the response
    with torch.no_grad():  # Disable gradient calculation for inference
        model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            streamer=text_streamer,
            max_new_tokens=100,  # Adjust this value as needed
            pad_token_id=tokenizer.eos_token_id,
        )
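
# Optional sketch (an assumption, not part of the original script): a non-streaming
# variant that returns the generated text as a string instead of printing it via
# TextStreamer. It reuses the `model` and `tokenizer` loaded above; the caller is
# expected to pass an already-formatted prompt (e.g. the labeled prompt built in
# generate_response).
def generate_response_text(prompt, max_new_tokens=100):
    inputs = tokenizer(
        [prompt],
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to("cuda")
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
        )
    # Drop the prompt tokens so only the newly generated continuation is decoded
    new_tokens = output_ids[0, inputs.input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
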
# Function to take user input and generate output
def user_interaction():
    while True:
        user_input = input("Enter conversation details (or type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            print("Exiting the program.")
            break
        print("Generating response for input:")
        generate_response(user_input)

# Start the user interaction
if __name__ == "__main__":
    user_interaction()