# Interactive chat with a local Llama 3 checkpoint via Hugging Face transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Path to the local checkpoint in Hugging Face format.
model_path = "./llama3-5b/hf"

# Optional quantization settings; None loads the model unquantized.
quantization_config = None
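# Illustrative alternative (assumed values, not part of the original script):
# the model could instead be loaded in 4-bit by supplying a BitsAndBytesConfig:
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
# )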

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype=torch.bfloat16,  # load weights in bfloat16; the fp32 default roughly doubles memory use
    quantization_config=quantization_config,
    output_hidden_states=True,   # expose hidden states on forward passes (unused by the chat loop below)
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]


def generate_response(messages):
    # Build the prompt: apply the tokenizer's chat template to the running
    # message history and append the assistant generation prompt.
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # Stop generation at the model's EOS token or Llama 3's end-of-turn token.
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    outputs = model.generate(
        input_ids,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
    )
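    # Note (not in the original script): do_sample=False would switch generate()
    # to deterministic greedy decoding instead of temperature/top_p sampling.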
    # Slice off the prompt tokens and decode only the newly generated ones.
    response = outputs[0][input_ids.shape[-1]:]
    return tokenizer.decode(response, skip_special_tokens=True)


# Simple interactive chat loop; type 'q' to quit.
while True:
    user_input = input("User: ")

    if user_input.lower() == 'q':
        break

    # Record the user turn, generate a reply, and append it to the history so
    # the model sees the full conversation on the next turn.
    messages.append({"role": "user", "content": user_input})
    response = generate_response(messages)
    print("Assistant:", response)
    messages.append({"role": "assistant", "content": response})